use std::io::BufferedReader; | |
use std::io::File; | |
use std::str::StrSlice; | |
fn main() { | |
let path = Path::new("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa"); | |
let mut file = BufferedReader::new(File::open(&path)); | |
let mut gc = 0i; | |
let mut at = 0i; | |
for line in file.lines() { | |
for c in line.unwrap().as_slice().chars() { | |
match c { | |
'G' => gc += 1, | |
'C' => gc += 1, | |
'A' => at += 1, | |
'T' => at += 1, | |
_ => {} | |
} | |
} | |
} | |
let gc_frac: f64 = (gc as f64) / ((at as f64) + (gc as f64)); | |
println!("GC fraction: {}", gc_frac) | |
} |
➜ gc-count time ./gc_count_rust
GC fraction: 0.376217
./gc_count_rust 5.75s user 0.00s system 99% cpu 5.753 total
Weird....
➜ gc-count time ./gc_count_c
37.6217301394
./gc_count_c 0.14s user 0.01s system 99% cpu 0.151 total
C still kix ass.... :D
C rocks ... as long as program safety is not that important :)
I think the use of lines is slowing you down. It has to ensure the stream is valid utf8. It might be better to loop over .bytes() instead.
The following is about 3× faster for me:
use std::io::File;
// Prints the GC fraction of the hard-coded input file, reading it as raw
// bytes in fixed-size chunks instead of as text lines.
fn main() {
    let input_path = Path::new("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa");
    let mut input = File::open(&input_path);
    let mut gc_total = 0i;
    let mut at_total = 0i;
    // 64 KiB (1 << 16 bytes) scratch buffer, reused for every read.
    let mut chunk = [0u8, .. 1 << 16];
    loop {
        let filled = match input.read(chunk) {
            Ok(count) => count,
            // Any read error (EOF etc.) ends the loop.
            Err(_) => break
        };
        // Scan only the part of the buffer this read reported as filled.
        for byte in chunk.slice_to(filled).iter() {
            match *byte {
                b'G' | b'C' => gc_total += 1,
                b'A' | b'T' => at_total += 1,
                _ => {}
            }
        }
    }
    let gc_f = gc_total as f64;
    let at_f = at_total as f64;
    let gc_frac: f64 = gc_f / (at_f + gc_f);
    println!("GC fraction: {}", gc_frac)
}
(Unlike Go, Rust ensures that strings are valid UTF-8: the validator shows up high in the profiles when using `lines`.)
@huonw: Cool, that's a 25% speedup of the fastest Go version I have on http://saml.rilspace.org/moar-languagez-gc-content-in-python-d-fpc-c-and-c ! :) Good job!
Hope to post updated benchmarks and graphs soon!
Aha, only I had forgotten the check on whether each line starts with '>', which we have in https://gist.github.com/samuell/5591369 ... Do you think you could update your code with that, @huonw? (I'm unfortunately not versed enough in Rust to know how to do it ...)
use std::io::File;
// Prints the GC fraction of the hard-coded input file, reading it as raw bytes
// in fixed-size chunks and ignoring every line that begins with '>'.
fn main() {
let path = Path::new("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa");
let mut file = File::open(&path);
// Running totals of G/C bytes and A/T bytes seen outside '>' lines.
let mut gc = 0i;
let mut at = 0i;
// Large buffer of bytes: 64 KiB (1 << 16), reused for every read.
let mut buf = [0u8, .. 1 << 16];
// Both flags live outside the read loop, so their state carries over from
// one chunk to the next.
// start_of_line: the previous byte was '\n' (initially true, for the first byte).
let mut start_of_line = true;
// inside_comment: currently skipping the remainder of a line that began with '>'.
let mut inside_comment = false;
loop {
let n = match file.read(buf) {
Ok(n) => n,
// Any read error (EOF etc.) ends the loop.
Err(_) => break
};
// Scan only the first n bytes of the buffer, n being what this read returned.
for b in buf.slice_to(n).iter() {
let is_newline = *b == b'\n';
if inside_comment {
// Nothing is counted on a '>' line; the newline ends the skipping.
if is_newline {
inside_comment = false;
}
} else {
match *b {
b'G' | b'C' => gc += 1,
b'A' | b'T' => at += 1,
// A '>' only starts a comment line when it is the first byte of a line.
b'>' if start_of_line => inside_comment = true,
_ => {}
}
}
// Updated after the match so the guard above sees the previous byte's state.
start_of_line = is_newline;
}
}
// Floating-point division: yields NaN rather than panicking if no bases were counted.
let gc_frac: f64 = (gc as f64) / ((at as f64) + (gc as f64));
println!("GC fraction: {}", gc_frac)
}
By the way, it seems some programs in your blog post skip the first line, and some others don't handle `>` line comments.
Did you catch the memory mapped version at https://gist.github.com/Blei/32d22fb92a3365da86b6?
@huonw: I think you're right about some unfortunate differences in the code examples. I really should compile an updated benchmark soon.
[samuel gc]$ rustc --opt-level 3 gc_count.rs
[samuel gc]$ time ./gc_count
GC fraction: 0.376217

real 0m0.392s
user 0m0.383s
sys 0m0.009s