Last active
December 20, 2016 20:17
-
-
Save caulagi/ec063cd338310dcf922de82dd43a074a to your computer and use it in GitHub Desktop.
A simple counter in Rust and Python integration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ cargo new --lib words
$ cargo build --release
$ python benchmark.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import time | |
from cffi import FFI | |
from collections import Counter | |
import sys

ffi = FFI()
# Declare the single C-ABI function exported by the Rust crate.
ffi.cdef("""
int most_common(const char *, int);
""")

# The shared library built by Cargo is named differently per platform,
# so choose the file name instead of assuming macOS.
if sys.platform == "darwin":
    _lib_name = "libcounter.dylib"
elif sys.platform == "win32":
    _lib_name = "counter.dll"
else:
    _lib_name = "libcounter.so"

C = ffi.dlopen(os.path.join("target", "release", _lib_name))
def benchmark_rs(path, n=10):
    """Time one call to the Rust `most_common` on the file at `path`.

    The path is encoded to UTF-8 bytes before the clock starts, so only the
    foreign call itself is measured. Returns the elapsed seconds as reported
    by `time.time()`.
    """
    encoded_path = bytes(path.encode('utf-8'))
    started = time.time()
    C.most_common(encoded_path, n)
    finished = time.time()
    return finished - started
def words(text):
    """Return every run of word characters in `text`, lowercased, in order."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]
def benchmark(path, n=10):
    """Time the pure-Python word count on the file at `path`.

    Reads the whole file, tallies its words with `Counter`, and asks for the
    `n` most common ones. The result is discarded; only the elapsed seconds
    (including the file read) are returned.
    """
    started = time.time()
    with open(path) as handle:
        contents = handle.read()
        tally = Counter(words(contents))
        tally.most_common(n)
    return time.time() - started
if __name__ == "__main__":
    # Run both implementations over every entry in the data directory and
    # report the accumulated wall-clock time for each.
    base_dir = 'data'
    time_rs = 0
    time_py = 0
    for entry in os.listdir(base_dir):
        path = os.path.join(base_dir, entry)
        time_rs = time_rs + benchmark_rs(path)
        time_py = time_py + benchmark(path)
    print("Time in rust: %s" % time_rs)
    print("Time in python: %s" % time_py)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package]
name = "words"
version = "0.1.0"
authors = ["Pradip Caulagi <caulagi@gmail.com>"]

[dependencies]
# Pinned to the 0.2 series: a "*" requirement would accept any future
# breaking release of the crate.
libc = "0.2"

[lib]
name = "counter"
# cdylib builds a shared library intended to be loaded from other languages,
# which is how the Python benchmark uses it.
crate-type = ["cdylib"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extern crate libc; | |
use std::clone::Clone; | |
use std::collections::HashMap; | |
use std::ffi::CStr; | |
use std::fs::File; | |
use std::hash::Hash; | |
use std::io::Read; | |
use std::str; | |
use libc::c_char; | |
#[no_mangle] | |
pub extern "C" fn most_common(c_buf: *const c_char, n: i32) -> i32 { | |
let buf = unsafe { CStr::from_ptr(c_buf).to_bytes() }; | |
let path = str::from_utf8(buf).unwrap(); | |
bucketize_words(path, n as usize)[0].1 as i32 | |
} | |
/// Read the file indicated by path and return the most common | |
/// words in the file along with number of occurences | |
fn bucketize_words(path: &str, n: usize) -> Vec<(String, usize)> { | |
let mut f = File::open(path).unwrap(); | |
let mut data = String::new(); | |
f.read_to_string(&mut data).unwrap(); | |
let mut bag = HashMap::new(); | |
for item in data.split_whitespace() { | |
let count = bag.entry(item.to_lowercase()).or_insert(0); | |
*count += 1; | |
} | |
n_most_common(bag, n) | |
} | |
/// Rank the entries of `bag` by their count, highest first, and keep at
/// most `n` of them.
///
/// Entries with equal counts keep the order in which the map yielded them.
fn n_most_common<T>(bag: HashMap<T, usize>, n: usize) -> Vec<(T, usize)>
    where T: Eq + Hash + Clone
{
    let mut ranked: Vec<(T, usize)> = Vec::with_capacity(bag.len());
    ranked.extend(bag);
    // Descending by count: compare ascending, then flip the ordering.
    ranked.sort_by(|lhs, rhs| lhs.1.cmp(&rhs.1).reverse());
    ranked.into_iter().take(n).collect()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment