caulagi/01-counter

## 01-counter
$ cargo new words
$ cargo build --release

$ python benchmark.py

## benchmark.py
import os
import re
import time

from cffi import FFI
from collections import Counter

# Declare the C signature of the function exported by the Rust library
# (`most_common` in lib.rs: a path pointer and a count in, an int out).
ffi = FFI()
ffi.cdef("""
    int most_common(const char *, int);
""")

# Load the release build of the `counter` library produced by `cargo build --release`.
# NOTE(review): the `.dylib` suffix is the macOS naming; other platforms
# presumably need a different file name here -- confirm before running elsewhere.
C = ffi.dlopen("target/release/libcounter.dylib")


def benchmark_rs(path, n=10):
    """Time a single call of the Rust `most_common` routine on `path`.

    The path is UTF-8 encoded before the clock starts, so only the foreign
    call itself is measured. Returns the elapsed time in seconds as
    reported by `time.time()`.
    """
    encoded_path = path.encode('utf-8')
    began = time.time()
    C.most_common(encoded_path, n)
    finished = time.time()
    return finished - began


def words(text):
    """Return the lowercased runs of regex word characters in `text`, in order."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]


def benchmark(path, n=10):
    """Time the pure-Python word count of the file at `path`.

    The measured span covers opening and reading the file, tokenizing it
    with `words`, and asking a `Counter` for its `n` most common entries.
    Returns the elapsed time in seconds as reported by `time.time()`.
    """
    began = time.time()
    with open(path) as handle:
        tally = Counter(words(handle.read()))
        tally.most_common(n)
    finished = time.time()
    return finished - began


if __name__ == "__main__":
    # Accumulate the total time spent by each implementation over every
    # entry of the data directory.
    base_dir = 'data'
    time_rs = 0
    time_py = 0
    for entry in os.listdir(base_dir):
        target = os.path.join(base_dir, entry)
        # Each file is timed with the Rust implementation first, then Python.
        time_rs += benchmark_rs(target)
        time_py += benchmark(target)

    print("Time in rust: %s" % time_rs)
    print("Time in python: %s" % time_py)

## Cargo.toml
[package]
name = "words"
version = "0.1.0"
authors = ["Pradip Caulagi <caulagi@gmail.com>"]

[dependencies]
# Used by lib.rs for the `c_char` type in the exported function's signature.
# NOTE(review): `*` accepts any published libc version; consider pinning one.
libc = "*"

[lib]
# The library is named `counter`, which is why benchmark.py loads
# `target/release/libcounter.dylib`.
name = "counter"
# NOTE(review): built as a dynamic library so it can be loaded via cffi;
# `cdylib` may be the better fit for a C-only consumer -- confirm.
crate-type = ["dylib"]

## lib.rs
extern crate libc;

use std::clone::Clone;
use std::collections::HashMap;
use std::ffi::CStr;
use std::fs::File;
use std::hash::Hash;
use std::io::Read;
use std::str;

use libc::c_char;

/// C-callable entry point: reports how often the most frequent word occurs
/// in a file.
///
/// `c_buf` is the NUL-terminated, UTF-8 encoded path of the file to scan and
/// `n` is the number of top entries to rank (only the first one is reported).
///
/// Returns the occurrence count of the most common word (saturated at
/// `i32::MAX`), `0` when the file contains no words or `n` is not positive,
/// and `-1` when `c_buf` is null, the path is not valid UTF-8, or the file
/// cannot be read as UTF-8 text.
///
/// Failures are reported through the return value instead of panicking,
/// because a panic must not escape an `extern "C"` function.
///
/// # Safety
///
/// A non-null `c_buf` must point to a NUL-terminated string that stays valid
/// for the duration of the call.
#[no_mangle]
pub extern "C" fn most_common(c_buf: *const c_char, n: i32) -> i32 {
    if c_buf.is_null() {
        return -1;
    }
    // SAFETY: `c_buf` is non-null (checked above) and the caller guarantees
    // it points to a NUL-terminated string that outlives this call.
    let raw_path = unsafe { CStr::from_ptr(c_buf) }.to_bytes();
    let path = match str::from_utf8(raw_path) {
        Ok(path) => path,
        Err(_) => return -1,
    };
    // A negative `n` would wrap to a huge `usize` under `as`; treat it as zero.
    let limit = if n > 0 { n as usize } else { 0 };

    match bucketize_words(path, limit) {
        // `first()` is `None` for an empty ranking (no words, or `limit == 0`).
        Ok(ranked) => ranked
            .first()
            .map_or(0, |entry| entry.1.min(i32::MAX as usize) as i32),
        Err(_) => -1,
    }
}

/// Reads the file at `path` and returns up to `n` of its most common words,
/// each paired with its number of occurrences, most frequent first.
///
/// Words are the whitespace-separated tokens of the file, lowercased.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or its
/// contents are not valid UTF-8.
fn bucketize_words(path: &str, n: usize) -> std::io::Result<Vec<(String, usize)>> {
    let mut file = File::open(path)?;
    let mut data = String::new();
    file.read_to_string(&mut data)?;

    let mut bag: HashMap<String, usize> = HashMap::new();
    for word in data.split_whitespace() {
        *bag.entry(word.to_lowercase()).or_insert(0) += 1;
    }

    Ok(n_most_common(bag, n))
}

/// Returns the `n` entries of `bag` with the highest counts, ordered from
/// most to least frequent.
///
/// Fewer than `n` entries are returned when the bag is smaller than `n`.
/// Entries with equal counts appear in no particular order, because the
/// iteration order of a `HashMap` is unspecified.
fn n_most_common<T>(bag: HashMap<T, usize>, n: usize) -> Vec<(T, usize)>
where
    T: Eq + Hash + Clone,
{
    let mut count_vec: Vec<_> = bag.into_iter().collect();
    // Ties are already unordered, so the cheaper unstable sort loses nothing.
    count_vec.sort_unstable_by(|a, b| b.1.cmp(&a.1));
    count_vec.truncate(n);
    count_vec
}
	$ cargo new words
	$ cargo build --release

	$ python benchmark.py
	import os
	import re
	import time

	from cffi import FFI
	from collections import Counter

	# Declare the C signature of the function exported by the Rust library.
	ffi = FFI()
	ffi.cdef("""
	int most_common(const char *, int);
	""")

	# Load the release build of the `counter` library.
	# NOTE(review): the `.dylib` suffix is the macOS naming -- confirm the
	# file name before running on another platform.
	C = ffi.dlopen("target/release/libcounter.dylib")


	def benchmark_rs(path, n=10):
	    """Time a single call of the Rust `most_common` routine on `path`.

	    The path is UTF-8 encoded before the clock starts. Returns the
	    elapsed time in seconds as reported by `time.time()`.
	    """
	    path = bytes(path.encode('utf-8'))
	    start = time.time()
	    C.most_common(path, n)
	    return time.time() - start


	def words(text):
	    """Return the lowercased runs of regex word characters in `text`, in order."""
	    return re.findall(r'\w+', text.lower())


	def benchmark(path, n=10):
	    """Time the pure-Python word count of the file at `path`.

	    The measured span covers reading the file, tokenizing it with
	    `words`, and asking a `Counter` for its `n` most common entries.
	    Returns the elapsed time in seconds as reported by `time.time()`.
	    """
	    start = time.time()
	    with open(path) as fp:
	        Counter(words(fp.read())).most_common(n)
	    return time.time() - start


	if __name__ == "__main__":
	    # Accumulate the total time spent by each implementation over every
	    # entry of the data directory.
	    time_rs = time_py = 0
	    base_dir = 'data'
	    for f in os.listdir(base_dir):
	        path = os.path.join(base_dir, f)
	        time_rs += benchmark_rs(path)
	        time_py += benchmark(path)

	    print("Time in rust: %s" % time_rs)
	    print("Time in python: %s" % time_py)
	[package]
	name = "words"
	version = "0.1.0"
	authors = ["Pradip Caulagi <caulagi@gmail.com>"]

	[dependencies]
	# NOTE(review): `*` accepts any published libc version; consider pinning one.
	libc = "*"

	[lib]
	# The library is named `counter`, matching the `libcounter` file the
	# benchmark script loads.
	name = "counter"
	crate-type = ["dylib"]
	extern crate libc;

	use std::clone::Clone;
	use std::collections::HashMap;
	use std::ffi::CStr;
	use std::fs::File;
	use std::hash::Hash;
	use std::io::Read;
	use std::str;

	use libc::c_char;

	/// C-callable entry point: reports how often the most frequent word occurs
	/// in a file.
	///
	/// `c_buf` is the NUL-terminated, UTF-8 encoded path of the file to scan and
	/// `n` is the number of top entries to rank (only the first one is reported).
	///
	/// Returns the occurrence count of the most common word (saturated at
	/// `i32::MAX`), `0` when the file contains no words or `n` is not positive,
	/// and `-1` when `c_buf` is null, the path is not valid UTF-8, or the file
	/// cannot be read as UTF-8 text.
	///
	/// Failures are reported through the return value instead of panicking,
	/// because a panic must not escape an `extern "C"` function.
	///
	/// # Safety
	///
	/// A non-null `c_buf` must point to a NUL-terminated string that stays valid
	/// for the duration of the call.
	#[no_mangle]
	pub extern "C" fn most_common(c_buf: *const c_char, n: i32) -> i32 {
	    if c_buf.is_null() {
	        return -1;
	    }
	    // SAFETY: `c_buf` is non-null (checked above) and the caller guarantees
	    // it points to a NUL-terminated string that outlives this call.
	    let raw_path = unsafe { CStr::from_ptr(c_buf) }.to_bytes();
	    let path = match str::from_utf8(raw_path) {
	        Ok(path) => path,
	        Err(_) => return -1,
	    };
	    // A negative `n` would wrap to a huge `usize` under `as`; treat it as zero.
	    let limit = if n > 0 { n as usize } else { 0 };

	    match bucketize_words(path, limit) {
	        // `first()` is `None` for an empty ranking (no words, or `limit == 0`).
	        Ok(ranked) => ranked
	            .first()
	            .map_or(0, |entry| entry.1.min(i32::MAX as usize) as i32),
	        Err(_) => -1,
	    }
	}

	/// Reads the file at `path` and returns up to `n` of its most common words,
	/// each paired with its number of occurrences, most frequent first.
	///
	/// Words are the whitespace-separated tokens of the file, lowercased.
	///
	/// # Errors
	///
	/// Returns the underlying I/O error when the file cannot be opened or its
	/// contents are not valid UTF-8.
	fn bucketize_words(path: &str, n: usize) -> std::io::Result<Vec<(String, usize)>> {
	    let mut file = File::open(path)?;
	    let mut data = String::new();
	    file.read_to_string(&mut data)?;

	    let mut bag: HashMap<String, usize> = HashMap::new();
	    for word in data.split_whitespace() {
	        *bag.entry(word.to_lowercase()).or_insert(0) += 1;
	    }

	    Ok(n_most_common(bag, n))
	}

	/// Returns the `n` entries of `bag` with the highest counts, ordered from
	/// most to least frequent.
	///
	/// Fewer than `n` entries are returned when the bag is smaller than `n`.
	/// Entries with equal counts appear in no particular order, because the
	/// iteration order of a `HashMap` is unspecified.
	fn n_most_common<T>(bag: HashMap<T, usize>, n: usize) -> Vec<(T, usize)>
	where
	    T: Eq + Hash + Clone,
	{
	    let mut count_vec: Vec<_> = bag.into_iter().collect();
	    // Ties are already unordered, so the cheaper unstable sort loses nothing.
	    count_vec.sort_unstable_by(|a, b| b.1.cmp(&a.1));
	    count_vec.truncate(n);
	    count_vec
	}