-
-
Save samuell/6f0ecbe4c5e88c04c387 to your computer and use it in GitHub Desktop.
use std::io::BufferedReader; | |
use std::io::File; | |
use std::str::StrSlice; | |
fn main() { | |
let path = Path::new("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa"); | |
let mut file = BufferedReader::new(File::open(&path)); | |
let mut gc = 0i; | |
let mut at = 0i; | |
for line in file.lines() { | |
for c in line.unwrap().as_slice().chars() { | |
match c { | |
'G' => gc += 1, | |
'C' => gc += 1, | |
'A' => at += 1, | |
'T' => at += 1, | |
_ => {} | |
} | |
} | |
} | |
let gc_frac: f64 = (gc as f64) / ((at as f64) + (gc as f64)); | |
println!("GC fraction: {}", gc_frac) | |
} |
C rocks ... as long as program safety is not that important :)
I think the use of `lines` is slowing you down: it has to ensure the stream is valid UTF-8. It might be better to loop over `.bytes()` instead.
The following is about 3× faster for me:
use std::io::File;
/// Prints the GC fraction of a hard-coded FASTA file, reading raw bytes.
///
/// The file is consumed in chunks through a fixed-size byte buffer, so no
/// per-line strings are built. Bytes `G`/`C` and `A`/`T` are tallied
/// separately; every other byte is ignored. The printed value is
/// `gc / (at + gc)`.
///
/// NOTE: there is no check for lines starting with `>`, so matching letters
/// in such lines are counted as well.
fn main() {
    let fasta_path = Path::new("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa");
    let mut input = File::open(&fasta_path);
    // Scratch buffer of 1 << 16 bytes, refilled on every pass of the loop.
    let mut chunk = [0u8, .. 1 << 16];
    let mut gc_count = 0i;
    let mut at_count = 0i;
    loop {
        let filled = match input.read(chunk) {
            Ok(len) => len,
            // Any `Err` (EOF etc.) ends the loop.
            Err(_) => break
        };
        // Only the first `filled` bytes of the buffer hold fresh data.
        for byte in chunk.slice_to(filled).iter() {
            match *byte {
                b'G' | b'C' => gc_count += 1,
                b'A' | b'T' => at_count += 1,
                _ => {}
            }
        }
    }
    let counted = (at_count as f64) + (gc_count as f64);
    let gc_frac: f64 = (gc_count as f64) / counted;
    println!("GC fraction: {}", gc_frac)
}
(Unlike Go, Rust ensures that strings are valid UTF-8: the validator shows up high in the profiles when using `lines`.)
@huonw: Cool, that's a 25% speedup over the fastest Go version I have on http://saml.rilspace.org/moar-languagez-gc-content-in-python-d-fpc-c-and-c ! :) Good job!
Hope to post updated benchmarks and graphs soon!
Aha, except I had forgotten the check on whether each line starts with '>', which we have in https://gist.github.com/samuell/5591369 ... Do you think you could update your code with that, @huonw? (I'm unfortunately not versed enough in Rust to know how to do it ...)
use std::io::File;
/// Prints the GC fraction of a hard-coded FASTA file, skipping `>` lines.
///
/// The file is consumed in chunks through a fixed-size byte buffer. A `>`
/// seen as the first byte of a line switches the scanner into "header" mode,
/// in which everything up to and including the next `\n` is ignored.
/// Outside of header mode, bytes `G`/`C` and `A`/`T` are tallied separately
/// and all other bytes are ignored. The printed value is `gc / (at + gc)`.
fn main() {
    let fasta_path = Path::new("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa");
    let mut input = File::open(&fasta_path);
    // Scratch buffer of 1 << 16 bytes, refilled on every pass of the loop.
    let mut chunk = [0u8, .. 1 << 16];
    let mut gc_count = 0i;
    let mut at_count = 0i;
    // Both flags live outside the read loop so that their state carries
    // over from one chunk to the next.
    let mut line_start = true;
    let mut in_header = false;
    loop {
        let filled = match input.read(chunk) {
            Ok(len) => len,
            // Any `Err` (EOF etc.) ends the loop.
            Err(_) => break
        };
        // Only the first `filled` bytes of the buffer hold fresh data.
        for byte in chunk.slice_to(filled).iter() {
            let at_newline = *byte == b'\n';
            if in_header {
                // Stay in header mode until the terminating newline.
                in_header = !at_newline;
            } else {
                match *byte {
                    b'>' if line_start => in_header = true,
                    b'G' | b'C' => gc_count += 1,
                    b'A' | b'T' => at_count += 1,
                    _ => {}
                }
            }
            // The byte after a newline is the first byte of a line.
            line_start = at_newline;
        }
    }
    let counted = (at_count as f64) + (gc_count as f64);
    let gc_frac: f64 = (gc_count as f64) / counted;
    println!("GC fraction: {}", gc_frac)
}
By the way, it seems some programs in your blog post skip the first line, and some others don't handle `>` line comments.
Did you catch the memory mapped version at https://gist.github.com/Blei/32d22fb92a3365da86b6?
@huonw: I think you're right about some unfortunate differences in the code examples. I really should compile an updated benchmark soon.
C still kix ass.... :D