anowell/benchmarks

## benchmarks
(Benchmark disclaimers apply. Don't trust the numbers without double checking my code.)

Scala (functional): 697,099 ns/iter
Scala (for loop):    20,467 ns/iter

Rust (functional):    4,984 ns/iter
Rust (for loop):      4,983 ns/iter // Note: this required adding assertions to prevent bounds checking

C (for loop):         6,172 ns/iter // Note: this used -O2 optimization

## zms.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

uint zms(uint *a, uint *b, uint aLen, uint bLen) {
    uint sum = 0;
    uint minLen = (aLen > bLen) ? aLen : bLen;

    for(int i=0; i < minLen; i++) {
        sum += (a[i] * b[i]);
    }
    return sum;
}

unsigned long get_time() {
    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec * 1e9 + ts.tv_nsec;
}

int main() {
    uint *these, *those;
    unsigned long start, stop;

    these = (uint*) malloc (10000 * sizeof(uint));
    those = (uint*) malloc (10000 * sizeof(uint));
    for(int i=0; i<10000; i++) {
        these[i] = i%997;
        those[i] = i%97;
    }

    // Storing sum to keep loop from being optimized away
    uint sum = 0;
    start = get_time();
    for(int i=0; i<1000; i++) {
        sum = zms(these, those, 10000, 10000);
    }
    stop = get_time();

    printf("sum=%d\n", sum);
    printf("%d ns/iter", (stop-start)/1000);
}

## zms.rs
#![feature(test)]

extern crate test;

pub fn zms(a: &[u32], b: &[u32]) -> u32 {
    a.iter()
        .zip(b)
        .map(|(i,j)| i*j)
        .sum()
}

pub fn zms_for_loop(a: &[u32], b: &[u32]) -> u32 {
    let mut sum = 0;
    let min_len = ::std::cmp::min(a.len(), b.len());

    // assert less than length to force LLVM to optimize out the bounds check
    // while still avoiding the unsafe block to use .get_unchecked(i)
    // otherwise the bounds checks ends up being ~40% of the tight loop
    assert!(min_len - 1 < a.len());
    assert!(min_len - 1 < b.len());

    for i in 0..min_len {
        sum += a[i] * b[i];
    }
    sum
}

#[cfg(test)]
mod tests {
    use test::Bencher;
    use super::*;

    fn prep() -> (Vec<u32>, Vec<u32>) {
        let mut these = vec![];
        let mut those = vec![];
        let count = 10000;
        for i in 0..count {
            these.push(i%997);
            those.push(i%97);
        }
        (these, those)
    }

    #[bench]
    fn bench_zms(b: &mut Bencher) {
        let (these, those) = prep();
        b.iter(|| zms(&these, &those) );
    }

    #[bench]
    fn bench_zms_for_loop(b: &mut Bencher) {
        let (these, those) = prep();
        b.iter(|| zms_for_loop(&these, &those));
    }
}

## zms.scala
var these: Array[Int] = Array();
var those: Array[Int] = Array();

// This is not efficient, but who cares, we aren't benching this step
for(i <- 0 until 10000) { these = these :+ i%997; those = those :+ i%97 }

def zms(a: Array[Int], b: Array[Int]) = a.zip(b).map{ case (i,j) => i*j }.sum

def zmsLoop(a: Array[Int], b: Array[Int]) = {
    var sum=0
    val min_len = math.min(a.length, b.length)
    for(i <- 0 until min_len) {
        sum += (a(i) * b(i))
    }
    sum
}

// Run zms 1000 times
val start = System.nanoTime; for(i <- 0 until 1000) { zms(these, those) }; val stop = System.nanoTime; val zmsDur = (stop-start)/1000

// Run zmsLoop 1000 times
val start = System.nanoTime; for(i <- 0 until 1000) { zmsLoop(these, those) }; val stop = System.nanoTime; val zmsLoopDur = (stop-start)/1000
	(Benchmark disclaimers apply. Don't trust the numbers without double checking my code.)

	Scala (functional): 697,099 ns/iter
	Scala (for loop): 20,467 ns/iter

	Rust (functional): 4,984 ns/iter
	Rust (for loop): 4,983 ns/iter // Note: this required adding assertions to prevent bounds checking

	C (for loop): 6,172 ns/iter // Note: this used -O2 optimization
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>

	uint zms(uint a, uint b, uint aLen, uint bLen) {
	uint sum = 0;
	uint minLen = (aLen > bLen) ? aLen : bLen;

	for(int i=0; i < minLen; i++) {
	sum += (a[i] * b[i]);
	}
	return sum;
	}

	unsigned long get_time() {
	timespec ts;
	clock_gettime(CLOCK_REALTIME, &ts);
	return ts.tv_sec * 1e9 + ts.tv_nsec;
	}

	int main() {
	uint these, those;
	unsigned long start, stop;

	these = (uint) malloc (10000 sizeof(uint));
	those = (uint) malloc (10000 sizeof(uint));
	for(int i=0; i<10000; i++) {
	these[i] = i%997;
	those[i] = i%97;
	}

	// Storing sum to keep loop from being optimized away
	uint sum = 0;
	start = get_time();
	for(int i=0; i<1000; i++) {
	sum = zms(these, those, 10000, 10000);
	}
	stop = get_time();

	printf("sum=%d\n", sum);
	printf("%d ns/iter", (stop-start)/1000);
	}
	#![feature(test)]

	extern crate test;

	pub fn zms(a: &[u32], b: &[u32]) -> u32 {
	a.iter()
	.zip(b)
	.map(\|(i,j)\| i*j)
	.sum()
	}

	pub fn zms_for_loop(a: &[u32], b: &[u32]) -> u32 {
	let mut sum = 0;
	let min_len = ::std::cmp::min(a.len(), b.len());

	// assert less than length to force LLVM to optimize out the bounds check
	// while still avoiding the unsafe block to use .get_unchecked(i)
	// otherwise the bounds checks ends up being ~40% of the tight loop
	assert!(min_len - 1 < a.len());
	assert!(min_len - 1 < b.len());

	for i in 0..min_len {
	sum += a[i] * b[i];
	}
	sum
	}

	#[cfg(test)]
	mod tests {
	use test::Bencher;
	use super::*;

	fn prep() -> (Vec<u32>, Vec<u32>) {
	let mut these = vec![];
	let mut those = vec![];
	let count = 10000;
	for i in 0..count {
	these.push(i%997);
	those.push(i%97);
	}
	(these, those)
	}

	#[bench]
	fn bench_zms(b: &mut Bencher) {
	let (these, those) = prep();
	b.iter(\|\| zms(&these, &those) );
	}

	#[bench]
	fn bench_zms_for_loop(b: &mut Bencher) {
	let (these, those) = prep();
	b.iter(\|\| zms_for_loop(&these, &those));
	}
	}
	var these: Array[Int] = Array();
	var those: Array[Int] = Array();

	// This is not efficient, but who cares, we aren't benching this step
	for(i <- 0 until 10000) { these = these :+ i%997; those = those :+ i%97 }

	def zms(a: Array[Int], b: Array[Int]) = a.zip(b).map{ case (i,j) => i*j }.sum

	def zmsLoop(a: Array[Int], b: Array[Int]) = {
	var sum=0
	val min_len = math.min(a.length, b.length)
	for(i <- 0 until min_len) {
	sum += (a(i) * b(i))
	}
	sum
	}

	// Run zms 1000 times
	val start = System.nanoTime; for(i <- 0 until 1000) { zms(these, those) }; val stop = System.nanoTime; val zmsDur = (stop-start)/1000

	// Run zmsLoop 1000 times
	val start = System.nanoTime; for(i <- 0 until 1000) { zmsLoop(these, those) }; val stop = System.nanoTime; val zmsLoopDur = (stop-start)/1000