henfiber/compare_compilers.R

## compare_compilers.R
#####################
## Updated and combo benchmark
## Including :
##      R-benchmark-25.R from Simon Urbanek
##      Revolution Analytics benchmarks : http://www.revolutionanalytics.com/revolution-revor-enterprise-benchmark-details
##      Extra benchmarks (loess and eigen)
##      LAPACK benchmarks (QR,eigen,cholesky,svd,prcomp,balance,kappa,norm,solve)
## TODO:
##      add extra functions like dist() and cluster()  - also provided by gputools
##      separate the gpu from cpu benchmarks, and the analysis from the benchmark code
##      precompute the matrices used throughout the experiments


library(compiler)
library(methods)
set.seed(1)

##########################
# Helper functions
##########################

# return a string with found BLAS/LAPACK library filenames
get_numlib <- function(blas_fallback="libblas", lapack_fallback="liblapack") {
    if(exists("getMKLthreads", mode="function")) {
        numlib <- "MKL_MKL"
    } else {
        blas <- blas_fallback; lapack <- lapack_fallback

        cmd <- paste0("lsof -p ", Sys.getpid(), " 2>/dev/null |
                      grep 'lib.*blas\\|lib.*lapack\\|lib.*atlas' |
                      awk -v FS='/' '{$0=substr($NF,1,index($NF,\".\")-1)} 1'")
        p <- pipe(cmd)
        libs <- readLines(p); close(p)

        if(any(grepl("atlas", libs))) {
            str <- libs[grep("atlas", libs)]
            blas <- unlist(strsplit(str, split=".", fixed = T))[1]
            lapack <- blas
        }
        if(any(grepl("openblas", libs))) {
            str <- libs[grep("openblas", libs)]
            blas <- unlist(strsplit(str, split="-"))[1]
        }
        if(length(blas) == 0 || blas=="")   blas <- blas_fallback
        if(length(lapack) == 0 || lapack=="") lapack <- lapack_fallback
        numlib <- paste(blas, lapack, sep = "_")

        # Compiling the C function to set OpenBLAS number of threads for OpenMP version
        # needed to run the implicit parallelism test, otherwise it fails
        if(grepl("openblaso", blas)) {
            if("OpenBlasThreads" %in% rownames(installed.packages())) {
                library(OpenBlasThreads)
            } else {
                message("Compiling C inline function openblas.set.num.threads()")
                compile_openblas_num_threads()
            }
        }
    }
    numlib
}

# Get a unique hash for this host using Sys.info()
# The system host name can be found using this process:
    # First, this function checks the system environment variables HOST, HOSTNAME, and COMPUTERNAME.
    # Second, it checks Sys.info()["nodename"] for host name details.
    # Finally, it tries to query the system command uname -n.
build_sysinfo_hash <- function(){
    return(substr(digest::digest(Sys.info(), "md5"), 1, 5))
}

# build a benchmark name using r version, jit status and loaded math libraries
build_benchmark_name <- function(jit_level=0) {
    numlib <- tryCatch(get_numlib(),
                       error = function(e) warning(paste0("While trying to detect the math libs : ", e$message)))

    name <- paste( # R.version$os,
        as.character(getRversion()),
        paste0("j", jit_level),
        numlib,
        sep = "_")
    if(interactive()) {
        response <- readline(paste0('The automatic benchmark name is "', name, '". ',
                                    "Press enter to accept. OR type a new custom name :\n"))
        if(response != "") name <- response
    }
    name
}

# Compile inline C function to set OpenBLAS number of threads
# Use as: openblas.set.num.threads(4)
compile_openblas_num_threads <- function() {
    require(inline)
    openblas.set.num.threads <<- cfunction(signature(ipt="integer"),
        body = 'openblas_set_num_threads(*ipt);', language = "C", convention = ".C",
        otherdefs = c ('extern void openblas_set_num_threads(int);'),
        libargs = c ('-L/usr/lib64/ -lblas'))
}

##########################
# Measurement functions
##########################

init_measurements <- function() {
    if(!exists("measurements") || is.null(measurements)) {
        measurements <<- data.frame(timestamp=format(Sys.time(), "%Y%m%dT%H%M%S"))
    }
}

add_measurement <- function(name,value) {
    measurements[1, name] <<- value
}

# Save measuremt in an .rds file
save_measurements <- function() {
    if(is.null(measurements)) {
        return(2)
    }
    if(!dir.exists("benchmarks_results")) {
        stop("Cannot find the benchmarks results folder. chdir to the appropriate location")
    }
    if(is.null(benchmark_name)) {
        if(interactive())
            benchmark_name <<- readline("Benchmark name? (unique - will override identical benchmark files) : ")
        else
            benchmark_name <<- build_benchmark_name(0)
    }

    if(!exists("host_hash")) {
        host_hash <- build_sysinfo_hash()
        if(interactive()) {
            response <- readline(paste0("Type a custom hostname if you want here. Otherwise the automatically generated hash ",
                                         host_hash,
                                         " will be used :  "))
            if(nchar(response)) host_hash <- response
        }
    }

    saveRDS(measurements, file=paste0("benchmarks_results/bench_", benchmark_name, "_",
                                      format(Sys.time(), "%Y%m%dT%H%M%S"), "_",
                                      host_hash,
                                      ".rds"))
}


# Run and measure a benchmarking function/expression
run_and_measure <- function(name, expr, runs=1){
    #expr <- substitute(expr) # Prevent early evaluation
    init_measurements()

    cat("###", name, "\n\n")

    timing <- unclass(system.time(expr))[1:3]

    add_measurement(paste0("cput_", name), timing[1] + timing[2])
    add_measurement(paste0("clock_", name), timing[3])

    # Print the timing
    print(timing)
    cat("\n")
}


# MAIN function - Run a whole experiment
run_experiment <- function(expr, jit_level=0) {
    library(compiler)
    enableJIT(jit_level)

    # Use the same seed across runs
    set.seed(1)
    gc()

    # Initialize measurements data frame
    measurements <<- data.frame(timestamp=format(Sys.time(), "%Y%m%dT%H%M%S"))

    # set the benchmark name
    benchmark_name <<- build_benchmark_name(jit_level)
    rownames(measurements)[1] <<- benchmark_name

    # Run the experiment
    cat("# EXPERIMENT:", benchmark_name, "\n\n")
    expr
    cat("\n")
}


#####################################
# Loading and ranking experiments
#####################################

# Load measurements saved in .rds files
load_measurements <- function(pattern='bench_.*', this_engine=TRUE){
    if(!dir.exists("benchmarks_results")) {
        stop("Cannot find the benchmarks results folder. chdir to the appropriate location")
    }

    if(this_engine)
        pattern <- paste0(pattern, build_sysinfo_hash(), "[.]+rds")
    else
        pattern <- paste0(pattern, "[.]+rds")

    found <- list.files(path = "benchmarks_results", pattern = pattern)
    if(length(found) > 0) {
        past_measurements <<- data.frame(timestamp=format(Sys.time(), "%Y%m%dT%H%M%S"), stringsAsFactors = F)
        for (f in found) {
            cur <- readRDS(paste0("benchmarks_results/", f))
            name <- rownames(cur)
            if(!this_engine) name <- paste0(name, "_", sub("(.*_)([A-Za-z0-9]+)[.]+rds", "\\2", f))
            cur <- cbind(data.frame(name=name, stringsAsFactors = F), cur)
            past_measurements <<- merge(past_measurements, cur, all=T)
        }
        past_measurements <<- past_measurements[complete.cases(past_measurements), ]
    } else {
        message("No past experiments have been found.")
    }
}

load_memory_measurements <- function(pattern='*.csv') {
    if(!dir.exists("benchmarks_results")) {
        stop("Cannot find the benchmarks results folder. chdir to the appropriate location")
    }
    past_measurements <<- do.call("rbind", lapply(list.files(path = "benchmarks_results",
                                pattern = pattern), function(f) {
        d <- read.csv(paste0("benchmarks_results/", f), stringsAsFactors = F)
        values <- t(d$mem_kb)
        colnames(values) <- t(d$func)
        rownames(values) <- f
        data.frame(values)
    }))
}


# Annotate experiments extracting info from the experiment name
annotate_experiments <- function() {
    if(!exists("past_measurements"))
        load_measurements()

    m <- past_measurements
    single_threaded <- c("libblas", "liblapack", "libsatlas", "libopenblas")
    openblas_versions <- c("libopenblas", "libopenblasp", "libopenblaso")
    atlas_versions <- c("libtatlas", "libsatlas")
    mkl_versions <- c("MKL")

    # Parse information from experiment name
    s <- strsplit(m$name, '_')
    d <- do.call("rbind", lapply(s, function(l){
        data.frame( r_version = l[1],
                    jit = substr(l[2], 2, 2),
                    blas = l[3],
                    lapack = if(length(l)>3) l[4] else l[3]
        )
    }))
    d$blas_opt = as.factor(ifelse(d$blas == "libblas", "default", "optimized"))
    d$lapack_opt = as.factor(ifelse(d$lapack == "liblapack", "default", "optimized"))
    d$blas_threaded = as.factor(ifelse(d$blas %in% single_threaded, FALSE, TRUE))
    d$lapack_threaded = as.factor(ifelse(d$lapack %in% single_threaded, FALSE, TRUE))
    d$blas_family = as.factor(ifelse(d$blas %in% openblas_versions, "openblas",
                              ifelse(d$blas %in% atlas_versions, "atlas",
                              ifelse(d$blas %in% mkl_versions, "mkl", "default"))))

    (m <- cbind(d, m))
}


# Rank experiments - according to different scores
# Overall/clock-time means, most wins, highest ratios etc.
rank_experiments <- function() {
    if(!exists("past_measurements"))
        load_measurements()

    # Index some categories of columns
    cols_numeric <- 3:ncol(past_measurements)
    cols_clock <- which(!grepl("^cput_", colnames(past_measurements)))
    cols_clock <- cols_clock[-c(1,2)]
    cols_cput <- which(grepl("^cput_", colnames(past_measurements)))
    cols_parallel <- which(grepl("_PARALLEL$", colnames(past_measurements)))
    cols_aggregated <-  which(grepl("_ALL$", colnames(past_measurements)))

    # We define geometric mean and trimmed geommetric mean
    gmean   <- function(x) {exp(mean(log(x)))}
    trgmean <- function(x) {sort(x); exp(mean(log(x[2:length(x)-1])))}

    # Rank by arithmetic means
    means <- rowMeans(past_measurements[, cols_numeric])
    print("Rank by overall arithmetic means")
    print(paste(past_measurements$name[order(means)], sort(means)))

    # Rank by clock-time means
    clock_means <- rowMeans(past_measurements[, cols_clock])
    print("Rank by clock-time means")
    print(paste(past_measurements$name[order(clock_means)], sort(clock_means)))

    # Rank by Geometric means
    geom_means <- apply(past_measurements[, cols_numeric], 1, gmean)
    print("Rank by overall geometric means")
    print(paste(past_measurements$name[order(geom_means)], sort(geom_means)))

    # Rank by Geometric clock-time means
    clock_geom_means <- apply(past_measurements[, cols_clock], 1, gmean)
    print("Rank by clock-time geometric means")
    print(paste(past_measurements$name[order(clock_geom_means)], sort(clock_geom_means)))

    # Rank by most wins
    wins <- apply(past_measurements[, cols_numeric], 2, function(x) which.min(x))
    wins <- data.frame(idx=names(table(wins)), wins=as.numeric(table(wins)), stringsAsFactors = F)
    # Add missing values, i.e. the experiments with no wins
    wins <- merge(data.frame(idx=1:nrow(past_measurements)), wins, all = T)
    wins$wins[is.na(wins$wins)] <- 0
    # Print
    print("Rank by most wins")
    print(paste(past_measurements[order(wins$wins, decreasing=T), ]$name, sort(wins$wins, decr=T)))

    # Rank by most clock-time wins
    wins <- apply(past_measurements[, cols_clock], 2, function(x) which.min(x))
    wins <- data.frame(idx=names(table(wins)), wins=as.numeric(table(wins)), stringsAsFactors = F)
    # Add missing values, i.e. the experiments with no wins
    wins <- merge(data.frame(idx=1:nrow(past_measurements)), wins, all = T)
    wins$wins[is.na(wins$wins)] <- 0
    # Print
    print("Rank by most clock-time wins")
    print(paste(past_measurements[order(wins$wins, decreasing=T), ]$name, sort(wins$wins, decr=T)))


    # Rank by most impressive win!
    best <- apply(past_measurements[, cols_clock], 2, function(x) which.min(x))
    best <- data.frame(idx=names(best), name=past_measurements$name[best], stringsAsFactors = F)
    best$ratio <- apply(past_measurements[, cols_clock], 2, function(x) max(x)/min(x))
    best <- best[order(best$ratio, decreasing = T), ]
    print("Rank by the highest speedup/ratio in a single score")
    print(best)


    # Rank by the most marginal winner! (the highest difference from the second)
    best_diff <- do.call("rbind", apply(past_measurements[, cols_clock], 2, function(x) {
        best <- which.min(x)
        x <- sort(x)
        data.frame(id=best, ratio=x[2]/x[1], stringsAsFactors = F)
    }))
    best_diff$name <- past_measurements$name[best_diff$id]
    best_diff <- best_diff[order(best_diff$ratio, decreasing = T), ]
    print("Rank by the highest marginal difference from the second in a single score")
    print(best_diff)


    # Rank by highest parallelization ratio - means_cput / means_clock
    cols <- colnames(past_measurements)
    cols_cput_paired <- setdiff(setdiff(cols_cput, cols_parallel), cols_aggregated)
    cols_clock_paired <- which(cols %in% sub("cput_", "clock_", cols[cols_cput_paired]))
    cput_clock_ratios <- rowMeans(past_measurements[, cols_cput_paired]) /
                            rowMeans(past_measurements[, cols_clock_paired])

    print("Rank by highest clock-time/cpu-time ratio")
    print(paste(past_measurements$name[order(cput_clock_ratios, decreasing = T)],
                sort(cput_clock_ratios, decreasing = T)))


    # Rank by overall scaled arithmetic means of CPU times
    clock_scaled_means <- rowMeans(scale(past_measurements[, cols_cput], center = F))
    print("Rank by overall scaled CPU times arithmetic means")
    print(paste(past_measurements$name[order(clock_scaled_means)], sort(clock_scaled_means)))


    # Rank by overall scaled arithmetic means of clock times
    clock_scaled_means <- rowMeans(scale(past_measurements[, cols_clock], center = F))
    print("Rank by overall scaled clock-times arithmetic means")
    print(paste(past_measurements$name[order(clock_scaled_means)], sort(clock_scaled_means)))


    # Rank by overall scaled geometric means (trimmed) of clock times
    clock_scaled_gmeans <- apply(scale(past_measurements[, cols_clock], center = F),
                                 1, trgmean)
    print("Rank by overall scaled clock-times (trimmed) geometric means")
    print(paste(past_measurements$name[order(clock_scaled_gmeans)], sort(clock_scaled_gmeans)))

}

top_experiments <- function(n_best=8) {
    if(!exists("past_measurements"))
        load_measurements()

    cols_numeric <- 3:ncol(past_measurements)
    cols_clock <- which(!grepl("^cput_", colnames(past_measurements)))
    cols_clock <- cols_clock[-c(1,2)]

    # Rank by most clock-time wins
    wins <- apply(past_measurements[, cols_clock], 2, function(x) which.min(x))
    wins <- data.frame(idx=names(table(wins)), wins=as.numeric(table(wins)), stringsAsFactors = F)
    # Add missing values, i.e. the experiments with no wins
    wins <- merge(data.frame(idx=1:nrow(past_measurements)), wins, all = T)
    wins$wins[is.na(wins$wins)] <- 0
    head(past_measurements[order(wins$wins, decreasing=T), ]$name, n_best)
}


plot_experiments <- function(){
    library(ggplot2)
    library(reshape2)
    if(!exists("past_measurements"))
        load_measurements()

    # the first column is a timestamp, the second an id of the experiment
    measure.vars <- colnames(past_measurements)[-c(1,2)]

    # define the attribute columns
    ids <- c("name", "blas", "lapack", "jit",
             "r_version", "blas_opt", "lapack_opt",
             "blas_threaded", "lapack_threaded")

    # Annotate experiments with metadata about the math libraries, r version and jit level used
    m <- annotate_experiments()


    # columns with cpu-time measurements
    cols_cput  <- colnames(m)[which(grepl("^cput_", colnames(m)))]
    # columns with geometric means (cloc-time) measurements
    cols_gmean <- colnames(m)[which(grepl("^trimgmean_", colnames(m)))]
    # columns with aggregated clock-time measurements
    cols_total <- colnames(m)[which(grepl("^clock.*_ALL$|^clock_PARALLEL$", colnames(m)))]
    # columns with clock-time measurements
    cols_clock <- colnames(m)[which(grepl("^clock_|^trimgmean_", colnames(m)))]
    cols_clock <- setdiff(cols_clock, cols_total)


    # Convert measurements to long format
    m_melt <- melt(m, id.vars = ids,
              measure.vars = measure.vars)
    m_melt$measurement <- as.character(m_melt$variable)
    m_melt$value <- as.numeric(m_melt$value)


    # Plot single columns
    qplot(name, clock_PARALLEL, data=m) + coord_flip()


    # Plot multiple measurements
    measurements_total <- subset(m_melt, measurement %in% cols_total)
    measurements_clock <- subset(m_melt, measurement %in% cols_clock)
    measurements_clock_threaded <- subset(measurements_clock, blas_threaded == TRUE)
    measurements_openblasp <- subset(measurements_clock, blas == "libopenblasp")
    measurements_clock_top <- subset(measurements_clock, name %in% top_experiments())


    # Line charts
    g <- ggplot(measurements_total, aes(x=measurement, y=value, color=name, group=name)) +
        scale_fill_brewer(type="qual", palette = 1) +
        theme(axis.text.x = element_text(angle=90))
    g + geom_line(data=measurements_clock_threaded, position = "jitter")


    # Bar charts
    g <- ggplot(measurements_total, aes(x=measurement, y=value, fill=name)) +
        scale_fill_brewer(type="qual")

    # Bar chart of cols_total
    g + geom_bar(data=measurements_total, stat="identity", position="dodge")

    # Bar chart of cols_clock
    g + geom_bar(data=measurements_clock, stat="identity", position="dodge") + coord_flip()

    # Bar chart of cols_clock for threaded libraries
    g + geom_bar(data=measurements_clock_threaded, stat="identity", position="dodge") + coord_flip()

    # Bar chart of cols_clock for topN experiments across other settings
    g + geom_bar(data=measurements_clock_top, stat="identity", position="dodge") + coord_flip()
}


##########################################
## Benchmark functions
##########################################

## R Benchmark 2.5 ##
## adapted as a function
# http://r.research.att.com/benchmarks/R-benchmark-25.R
# R Benchmark 2.5 (06/2008) [Simon Urbanek]
# version 2.5: scaled to get roughly 1s per test, R 2.7.0 @ 2.6GHz Mac Pro
# R Benchmark 2.4 (06/2008) [Simon Urbanek]
# version 2.4 adapted to more recent Matrix package
# R Benchmark 2.3 (21 April 2004)
# Warning: changes are not carefully checked yet!
# version 2.3 adapted to R 1.9.0
# Many thanks to Douglas Bates (bates@stat.wisc.edu) for improvements!
# version 2.2 adapted to R 1.8.0
# version 2.1 adapted to R 1.7.0
# version 2, scaled to get 1 +/- 0.1 sec with R 1.6.2
# using the standard ATLAS library (Rblas.dll)
# on a Pentium IV 1.6 Ghz with 1 Gb Ram on Win XP pro

# revised and optimized for R v. 1.5.x, 8 June 2002
# Requires additionnal libraries: Matrix, SuppDists
# Author : Philippe Grosjean
# eMail  : phgrosjean@sciviews.org
# Web    : http://www.sciviews.org
# License: GPL 2 or above at your convenience (see: http://www.gnu.org)
#
# Several tests are adapted from the Splus Benchmark Test V. 2
# by Stephan Steinhaus (stst@informatik.uni-frankfurt.de)
# Reference for Escoufier's equivalents vectors (test III.5):
# Escoufier Y., 1970. Echantillonnage dans une population de variables
# aleatoires réelles. Publ. Inst. Statis. Univ. Paris 19 Fasc 4, 1-47.
####
rbenchmark <- function(runs=3) {
    set.seed (1)
    runs <- runs                       # Number of times the tests are executed
    times <- rep(0, 15); dim(times) <- c(5,3)
    require(Matrix)         # Optimized matrix operations

    # Runif <- rMWC1019      # The fast uniform number generator
    Runif <- runif
    # If you don't have SuppDists, you can use: Runif <- runif
    # a <- rMWC1019(10, new.start=TRUE, seed=492166) # Init. the generator
    # Rnorm <- rziggurat     # The fast normal number generator
    # If you don't have SuppDists, you can use: Rnorm <- rnorm
    # b <- rziggurat(10, new.start=TRUE)     # Init. the generator
    Rnorm <- rnorm
    # remove("a", "b")

    options(object.size=100000000)

    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    cat("## Synthetic benchmarks (2.5)\n\n")

    cat(c("Number of times each test is run__________________________: ", runs))
    cat("\n\n")

    cat("### I. Matrix calculation\n\n")

    # (1)
    name = "MAT_NEW_T_DEF"

        cumulate <- numeric(5); a <- 0; b <- 0
        for (i in 1:runs) {
            invisible(gc())
            timing <- system.time({
                a <- matrix(Rnorm(2500*2500)/10, ncol=2500, nrow=2500);
                b <- t(a);
                dim(b) <- c(1250, 5000);
                a <- t(b)
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[1, 1] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("Creation, transp., deformation of a 2500x2500 matrix (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (2)
    name = "MAT_POW_2500"

        cumulate <- numeric(5); b <- 0
        for (i in 1:runs) {
            a <- abs(matrix(Rnorm(2500*2500)/2, ncol=2500, nrow=2500));
            invisible(gc())
            timing <- system.time({
                b <- a^1000
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[2, 1] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("2500x2500 normal distributed random matrix ^1000____ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (3)
    name = "SORT_7M"

        cumulate <- numeric(5); b <- 0
        for (i in 1:runs) {
            a <- Rnorm(7000000)
            invisible(gc())
            timing <- system.time({
                b <- sort(a, method="quick")    # Sort is modified in v. 1.5.x
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[3, 1] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("Sorting of 7,000,000 random values__________________ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (4)
    name = "CROSS_PROD_2800"

        cumulate <- numeric(5); b <- 0
        for (i in 1:runs) {
            a <- Rnorm(2800*2800); dim(a) <- c(2800, 2800)
            invisible(gc())
            timing <- system.time({
                b <- crossprod(a)               # equivalent to: b <- t(a) %*% a
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[4, 1] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("2800x2800 cross-product matrix (b = a' * a)_________ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (5)
    name = "LM_CUSTOM"

        cumulate <- numeric(5); c <- 0; qra <-0
        for (i in 1:runs) {
            a <- new("dgeMatrix", x = Rnorm(2000*2000), Dim = as.integer(c(2000,2000)))
            b <- as.double(1:2000)
            invisible(gc())
            timing <- system.time({
                c <- solve(crossprod(a), crossprod(a,b))
            })
            cumulate <- cumulate + timing

            # This is the old method
            # a <- Rnorm(600*600); dim(a) <- c(600,600)
            # b <- 1:600
            # invisible(gc())
            # timing <- system.time({
            #   qra <- qr(a, tol = 1e-7);
            #   c <- qr.coef(qra, b)
            #   #Rem: a little faster than c <- lsfit(a, b, inter=F)$coefficients
            # })[3]
            # cumulate <- cumulate + timing
        }
        remove("a", "b", "c", "qra")

        timing <- cumulate/runs
        times[5, 1] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("Linear regr. over a 3000x3000 matrix (c = a \\ b')___ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()

    # Summary of Matrix calculation tests
    times[ , 1] <- sort(times[ , 1])
    cat("                   --------------------------------------------\n")
    cat(c("                   Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 1]))), "\n\n"))

    # Add the overall measurement
    add_measurement(paste0("trimgmean_", "MATRIX_CALC"), exp(mean(log(times[2:4, 1]))))


    cat("### II. Matrix functions\n\n")


    if (R.Version()$os == "Win32") flush.console()


    # (1)
    name = "FFT"

        cumulate <- numeric(5); b <- 0
        for (i in 1:runs) {
            a <- Rnorm(2400000)
            invisible(gc())
            timing <- system.time({
                b <- fft(a)
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[1, 2] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("FFT over 2,400,000 random values____________________ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (2)
    name = "EIGEN_600"

        cumulate <- numeric(5); b <- 0
        for (i in 1:runs) {
            a <- array(Rnorm(600*600), dim = c(600, 600))
            # Only needed if using eigen.Matrix(): Matrix.class(a)
            invisible(gc())
            timing <- system.time({
                b <- eigen(a, symmetric=FALSE, only.values=TRUE)$Value
                # Rem: on my machine, it is faster than:
                #        b <- La.eigen(a, symmetric=F, only.values=T, method="dsyevr")$Value
                #        b <- La.eigen(a, symmetric=F, only.values=T, method="dsyev")$Value
                #  b <- eigen.Matrix(a, vectors = F)$Value
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[2, 2] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])


    cat(c("Eigenvalues of a 640x640 random matrix______________ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (3)
    name = "DET_2500"

        cumulate <- numeric(5); b <- 0
        for (i in 1:runs) {
            a <- Rnorm(2500*2500); dim(a) <- c(2500, 2500)
            #Matrix.class(a)
            invisible(gc())
            timing <- system.time({
                #b <- determinant(a, logarithm=F)
                # Rem: the following is slower on my computer!
                # b <- det.default(a)
                b <- det(a)
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[3, 2] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("Determinant of a 2500x2500 random matrix____________ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (4)
    name = "CHOL_3K"

        cumulate <- numeric(5); b <- 0
        for (i in 1:runs) {
            a <- crossprod(new("dgeMatrix", x = Rnorm(3000*3000),
                               Dim = as.integer(c(3000, 3000))))
            invisible(gc())
            #a <- Rnorm(900*900); dim(a) <- c(900, 900)
            #a <- crossprod(a, a)
            timing <- system.time({
                b <- chol(a)
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[4, 2] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])


    cat(c("Cholesky decomposition of a 3000x3000 matrix________ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (5)
    name = "INVERSE_1600"

        cumulate <- numeric(5); b <- 0
        for (i in 1:runs) {
            a <- new("dgeMatrix", x = Rnorm(1600*1600), Dim = as.integer(c(1600, 1600)))
            invisible(gc())
            # a <- Rnorm(400*400); dim(a) <- c(400, 400)
            timing <- system.time({
                # b <- qr.solve(a)
                # Rem: a little faster than
                b <- solve(a)
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[5, 2] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("Inverse of a 1600x1600 random matrix________________ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # Total time for Matrix functions
    times[ , 2] <- sort(times[ , 2])
    cat("                      --------------------------------------------\n")
    cat(c("                      Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 2]))), "\n\n"))

    # Add the overall measurement
    add_measurement(paste0("trimgmean_", "MATRIX_FUNC"), exp(mean(log(times[2:4, 2]))))


    cat("### III. Programmation\n\n")


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (1)
    name = "FIBO_3.5M"

        cumulate <- numeric(5); a <- 0; b <- 0; phi <- 1.6180339887498949
        for (i in 1:runs) {
            a <- floor(Runif(3500000)*1000)
            invisible(gc())
            timing <- system.time({
                b <- (phi^a - (-phi)^(-a))/sqrt(5)
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b", "phi")

        timing <- cumulate/runs
        times[1, 3] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])


    cat(c("3,500,000 Fibonacci numbers calculation (vector calc)(sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (2)
    name = "HILBERT_3K"

        cumulate <- numeric(5); a <- 3000; b <- 0
        for (i in 1:runs) {
            invisible(gc())
            timing <- system.time({
                b <- rep(1:a, a); dim(b) <- c(a, a);
                b <- 1 / (t(b) + 0:(a-1))
                # Rem: this is twice as fast as the following code proposed by R programmers
                # a <- 1:a; b <- 1 / outer(a - 1, a, "+")
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b")

        timing <- cumulate/runs
        times[2, 3] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("Creation of a 3000x3000 Hilbert matrix (matrix calc) (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (3)
    name = "GCD_400K"

        cumulate <- numeric(5); c <- 0

        # Define the gcd function
        gcd2 <- function(x, y) {if (sum(y > 1.0E-4) == 0) x else {y[y == 0] <- x[y == 0]; Recall(y, x %% y)}}

        for (i in 1:runs) {
            a <- ceiling(Runif(400000)*1000)
            b <- ceiling(Runif(400000)*1000)
            invisible(gc())
            timing <- system.time({
                c <- gcd2(a, b)                            # gcd2 is a recursive function
            })
            cumulate <- cumulate + timing
        }
        remove("a", "b", "c", "gcd2")

        timing <- cumulate/runs
        times[3, 3] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("Grand common divisors of 400,000 pairs (recursion)__ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (4)
    name = "TOEPLITZ_500"

        cumulate <- numeric(5); b <- 0
        for (i in 1:runs) {
            b <- rep(0, 500*500); dim(b) <- c(500, 500)
            invisible(gc())
            timing <- system.time({
                # Rem: there are faster ways to do this
                # but here we want to time loops (220*220 'for' loops)!
                for (j in 1:500) {
                    for (k in 1:500) {
                        b[k,j] <- abs(j - k) + 1
                    }
                }
            })
            cumulate <- cumulate + timing
        }
        remove("b", "j", "k")

        timing <- cumulate/runs
        times[4, 3] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("Creation of a 500x500 Toeplitz matrix (loops)_______ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    # (5)
    name = "ESCOUFIER_45"

        cumulate <- numeric(5); p <- 0; vt <- 0; vr <- 0; vrt <- 0; rvt <- 0; RV <- 0; j <- 0; k <- 0;
        x2 <- 0; R <- 0; Rxx <- 0; Ryy <- 0; Rxy <- 0; Ryx <- 0; Rvmax <- 0

        # Calculate the trace of a matrix (sum of its diagonal elements)
        Trace <- function(y) {sum(c(y)[1 + 0:(min(dim(y)) - 1) * (dim(y)[1] + 1)], na.rm=FALSE)}

        for (i in 1:runs) {
            x <- abs(Rnorm(45*45)); dim(x) <- c(45, 45)
            invisible(gc())
            timing <- system.time({
                # Calculation of Escoufier's equivalent vectors
                p <- ncol(x)
                vt <- 1:p                                  # Variables to test
                vr <- NULL                                 # Result: ordered variables
                RV <- 1:p                                  # Result: correlations
                vrt <- NULL
                for (j in 1:p) {                           # loop on the variable number
                    Rvmax <- 0
                    for (k in 1:(p-j+1)) {                   # loop on the variables
                        x2 <- cbind(x, x[,vr], x[,vt[k]])
                        R <- cor(x2)                           # Correlations table
                        Ryy <- R[1:p, 1:p]
                        Rxx <- R[(p+1):(p+j), (p+1):(p+j)]
                        Rxy <- R[(p+1):(p+j), 1:p]
                        Ryx <- t(Rxy)
                        rvt <- Trace(Ryx %*% Rxy) / sqrt(Trace(Ryy %*% Ryy) * Trace(Rxx %*% Rxx)) # RV calculation
                        if (rvt > Rvmax) {
                            Rvmax <- rvt                         # test of RV
                            vrt <- vt[k]                         # temporary held variable
                        }
                    }
                    vr[j] <- vrt                             # Result: variable
                    RV[j] <- Rvmax                           # Result: correlation
                    vt <- vt[vt!=vr[j]]                      # reidentify variables to test
                }
            })
            cumulate <- cumulate + timing
        }
        remove("x", "p", "vt", "vr", "vrt", "rvt", "RV", "j", "k")
        remove("x2", "R", "Rxx", "Ryy", "Rxy", "Ryx", "Rvmax", "Trace")

        timing <- cumulate/runs
        times[5, 3] <- timing[3]

        # Add the measurement
        add_measurement(paste0("cput_", name), timing[1] + timing[2])
        add_measurement(paste0("clock_", name), timing[3])

    cat(c("Escoufier's method on a 45x45 matrix (mixed)________ (sec): ", timing[3], "\n"))


    if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()


    ## Total time for Programmation tests
    times[ , 3] <- sort(times[ , 3])
    cat("                  --------------------------------------------\n")
    cat(c("                  Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 3]))), "\n\n\n"))
    cat("\n\n")

    # Add the overall measurement
    add_measurement(paste0("trimgmean_", "PROGRAMMATION"), exp(mean(log(times[2:4, 3]))))


    #### TOTAL TIME
    cat("### Total time for synthetic tests\n\n")

    cat(c("Total time for all 15 tests_________________________ (sec): ", sum(times), "\n"))
    cat(c("Overall mean (sum of I, II and III trimmed means/3)_ (sec): ", exp(mean(log(times[2:4, ]))), "\n"))
    cat("\n\n")

    # Add the net time taken for all synthetic tests as a measurement
    add_measurement(paste0("net_sum_", "SYNTHETIC"), sum(times))
}


# run r benchmark 2.5
run_synthetic <- function() {
    run_and_measure("SYNTHETIC_ALL", rbenchmark(runs=8))
}


###############################################
## bench.R
## http://r.research.att.com/benchmarks/bench.R
###############################################

run_extras <- function() {
    set.seed (1)
    cat("\n")
    cat("## Extra benchmarks\n\n")
    # hilbert
    hilbert <- function(n) 1/(outer(seq(n),seq(n),"+")-1)
    X <- hilbert(1000)
    run_and_measure("HILBERT_1000", eigen(X))

    # loess
    loess.me <- function(n) {
        x<-rnorm(10^n); y<-rnorm(10^n); z<-rnorm(10^n)
        invisible(loess(z~x+y))
    }
    run_and_measure("LOESS_4", loess.me(4))
}


#####################################
## Revolution Analytics Benchmarks
##
#####################################

# Matrix multiplication
run_matmult <- function(m=5000, n=5000){
    cat("\n")
    A <- matrix (runif (m*n),m,n)
    run_and_measure("MATMUL_custom", sum(A%*%t(A)))
    run_and_measure("MATMUL_tcross", tcrossprod(A))
}

# Linear Discriminant Analysis
run_lda <- function(m=10000, n=1000, g=5) {
    cat("\n")
    require ('MASS')
    set.seed (1)
    A <- matrix (runif (m*n),m,n)
    A <- data.frame (A, fac=sample(LETTERS[1:g], m, replace=TRUE))
    k <- round (m/2)
    train <- sample(1:m, k)
    run_and_measure("LDA", L <- lda(fac ~., data=A, prior=rep(1,g)/g, subset=train))
}


# Cholesky Factorization
# Compute the Choleski factorization of a real symmetric positive-definite square matrix.
# we calculate the average of chol and chol2inv in a large matrix
# uses LAPACK routines DPOTRF and DPSTRF
run_cholesky_large <- function(n=7000) {
    cat("\n")
    A <- matrix (runif (n*n), n, n)
    B <- crossprod(A)
    run_and_measure("CHOL_7K", C <- chol(B, pivot = T))
    run_and_measure("CHOL2INV_7K", chol2inv(C))
}


# Singular Value Decomposition (SVD)
# uses LAPACK routines DGESDD and ZGESDD
run_svd <- function(m=10000, n=2000) {
    cat("\n")
    A <- matrix (runif (m*n),m,n)
    run_and_measure("SVD", S <- svd(A,nu=0,nv=0))
}


# Principal Components Analysis (prcomp)
# prcomp uses svd, so we expect that LAPACK is used
# princomp uses eigen, so we also expect that LAPACK is used
run_pca <- function(m=4000, n=2000) {
    A <- matrix (runif (m*n),m,n)
    run_and_measure("PRCOMP", P <- prcomp(A))
    run_and_measure("PRINCOMP", P <- princomp(A))
}


# QR decomposition
# using LAPACK routines DGEQP3 and ZGEQP3
run_qr <- function(m=7000, n=2000) {
    X <- matrix(runif(m*n), m, n)
    run_and_measure("QR_la", Y <- qr(X, LAPACK = T))
}

# Eigen
# Uses LAPACK routines DSYEVR, DGEEV, ZHEEV and ZGEEV
run_eigen <- function(m=3000) {
    hilbert <- function(n) 1/(outer(seq(n),seq(n),"+")-1)
    X <- hilbert(m)
    run_and_measure("EIGEN", eigen(X))
}

# Kappa
# Uses LAPACK routines DTRCON and ZTRCON
run_kappa <- function(m=2000) {
    X <- matrix(runif(m*m), m, m)
    run_and_measure("KAPPA", kappa(X, norm = "O", LINPACK=F))
}


# SOLVE
# Uses LAPACK routines DGESV and ZGESV
run_solve <- function(m=3000) {
    hilbert <- function(n) { i <- 1:n; 1 / outer(i - 1, i, "+") }
    X <- hilbert(m) + matrix(rnorm(m*m, 1), m)
    run_and_measure("SOLVE", solve(X))
}

# NORM
# Uses LAPACK routine DLANGE
run_norm <- function(m=7000) {
    hilbert <- function(n) { i <- 1:n; 1 / outer(i - 1, i, "+") }
    X <- hilbert(m)
    #nTyp <- eval(formals(base::norm)$type)
    nTyp <- c('M','1','O','I','F', 'E')
    run_and_measure("NORM", sapply(nTyp, norm, x = X))
}


# BALANCE matrix
# Uses LAPACK DGEBAL
run_balance <- function(m=10000) {
    X <- matrix(runif(m*m), m, m)
    run_and_measure("BALANCE", expm::balance(X))
}


##########################
## Implicit parallelism ##
##########################

# Load the parallel library and fix conflicts with MKL/OpenMP multithreaded libs
load_parallel <- function() {
    library(parallel)
    # Fix conflicts for the MKL and OpenBlas OpenMP multithreaded libraries
    if(exists("setMKLthreads", mode="function"))
        setMKLthreads(1)
    if(exists("openblas.set.num.threads", mode="function"))
        openblas.set.num.threads(1)
}

# Produce a matrix with "times" permutations of the "tests" vector
permute_tests <- function(tests, times=4) {
    if(times > 1) {
        set.seed(NULL)
        tests <- rbind(tests, rev(tests)) # adding reverse second row
        for (i in seq(length.out = times-2))
            tests <- rbind(tests, sample(tests[1, ], ncol(tests), replace = F))
    }
    tests
}


# Run the default benchmark in parallel
run_parallel <- function(cores=4) {
    load_parallel()

    set.seed (1)
    runs <- rep(2, cores)

    run_and_measure("PARALLEL", mclapply(runs, rbenchmark, mc.cores=cores))
}

# Run "cores" parallel threads for the same code (in the same order)
run_parallel_ident <- function(cores=4) {
    load_parallel()

    set.seed (1)
    runs <- rep(1, cores)

    bench <- function(runs=1) {
        funcs <- c("run_matmult", "run_qr", "run_norm", "run_extras")  # "run_solve"
        sapply(funcs, function(x) eval(get(x)()))
    }

    run_and_measure("PARALLEL_IDENTICAL", mclapply(runs, bench, mc.cores=cores))
}

# Run "cores" parallel threads in a permutted order
run_parallel_permuted <- function(cores=4) {
    load_parallel()

    set.seed (1)
    index <- 1:cores

    # reset output file
    system(": > /tmp/parbench")

    funcs <- c("run_matmult", "run_qr", "run_norm", "run_extras")  # "run_solve"
    funcs <- permute_tests(funcs, cores)

    bench <- function(index=1) {
        set.seed(NULL)
        cmd <- paste0("echo ", paste(funcs[index, ], collapse = " "), " >>/tmp/parbench")
        system(cmd)
        sapply(funcs[index,], function(x) eval(get(x)()))
    }

    run_and_measure("PARALLEL_PERMUTTED", mclapply(index, bench, mc.cores=cores))
}


##########################
## GPU tests            ##
##########################

# Matrix multiplication on GPU
# gpuMatMult is merely a wrapper for the CUBLAS cublasDgemm function.
run_matmult_gpu <- function(m=5000, n=5000){
    library("gputools")
    cat("\n")
    A <- matrix(runif (m*n),m,n)
    run_and_measure("MATMUL_GPU", gpuMatMult(A, t(A)))
    run_and_measure("MATMUL_tcross_GPU", gpuTcrossprod(A))

    # Another one to compare with CROSS_PROD_2800 (CPU)
    A <- matrix (runif(2800*2800),2800,2800)
    run_and_measure("CROSS_PROD_2800_GPU", gpuCrossprod(A))
}

# QR decomposition on GPU
# gpuQR estimates the QR decomposition for a matrix using column pivoting and householder matrices.
run_qr_gpu <- function(m=7000, n=2000) {
    library("gputools")
    X <- matrix(runif(m*n), m, n)
    run_and_measure("QR_GPU", gpuQr(X))
}

# Inverse of a Matrix using GPU
run_inverse_gpu <- function(m=1600, n=1600) {
    library("gputools")
    A <- matrix (runif (m*n),m,n)
    run_and_measure("INVERSE_1600_GPU", b <- gpuSolve(A))
}


# Custom linear model impl. using gpu functions
run_custom_lm_gpu <- function(m=2000, n=2000) {
    library("gputools")
    A <- matrix (runif (m*n),m,n)
    b <- as.double(1:m)
    run_and_measure("LM_CUSTOM_GPU", c <- gpuSolve(gpuCrossprod(A), gpuCrossprod(A,b)))
}


# Run a sort task in GPU using the Rth:rthsort() function
run_sort_gpu <- function(m=7000000) {
    library("Rth")
    a <- runif(m)
    run_and_measure("SORT_7M_GPU", b <- rthsort(a))
}

# Parallel running of matrix multiplication in the GPU
run_parallel_gpu <- function(reps=2) {
    library(parallel)
    set.seed (1)
    cl <- makeForkCluster(2)
    run_and_measure("GPU_PARALLEL", clusterCall(cl, function() run_matmult_gpu(2000,2000)))
    stopCluster(cl)
}

# Run shipped demos
run_demos <- function() {
    # demos <- unclass(demo(package="stats"))$results[,3]
    demos <- c("glm.vr", "lm.glm", "nlm", "smooth")
    run_and_measure("DEMOS", sapply(demos, function(x) demo(x, package="stats",
                                                character.only = T, ask = FALSE)))
}

######################################
## Batch running tests

# LAPACK-using methods tests
# functions:
#   eigen, qr, svd, chol, chol2inv, solve, det (uses QR)
#   kappa, rcond (similar to kappa), norm, expm::balance
run_lapack <- function(m=10000, n=5000) {
    set.seed(1)
    run_and_measure("LAPACK_ALL", {
        run_qr()
        run_eigen()
        run_norm()
        run_cholesky_large()
        run_svd()
        run_balance()
        run_pca()
        run_kappa()
        run_solve()
    })
}


# We exclude the implicit parallelism test because some threaded BLAS configuration produce segfaults
run_safe <- function(jit_level=0) {
    run_and_measure("TESTS_SAFE_ALL", {
        run_synthetic()
        run_extras()
        run_matmult()
        run_lda()
        run_lapack()
    })
}

run_gpu <- function() {
    run_matmult_gpu()
    run_qr_gpu
    run_inverse_gpu()
    run_custom_lm_gpu()
}

run_all <- function() {
    run_and_measure("TESTS_ALL", {
        run_safe()
        run_parallel()
    })
}