defmodule ATGCCount do
  @moduledoc """
  Counts A/T and G/C bases in one chunk of FASTA text.
  """

  @doc """
  Returns `{at, gc}` for `sequence`, where `at` is the number of `A`/`T`
  characters and `gc` is the number of `G`/`C` characters.

  Only upper-case `A`, `T`, `G` and `C` are counted. Every other character
  (for example `N` or a trailing newline) is skipped. Counting stops at the
  first `>`, so a line that starts with `>` yields `{0, 0}`.

  ## Examples

      iex> ATGCCount.count("NACGT")
      {2, 2}

      iex> ATGCCount.count(">chrY")
      {0, 0}
  """
  @spec count(String.t()) :: {non_neg_integer(), non_neg_integer()}
  def count(sequence), do: cnt(String.to_charlist(sequence), 0, 0)

  @doc """
  Walks the charlist in the first argument and returns the updated
  `{at, gc}` counters.

  `at` and `gc` are the running totals to start from; `count/1` starts
  both at 0.
  """
  @spec cnt(charlist(), integer(), integer()) :: {integer(), integer()}
  def cnt([?A | t], at, gc), do: cnt(t, at + 1, gc)
  def cnt([?T | t], at, gc), do: cnt(t, at + 1, gc)
  def cnt([?G | t], at, gc), do: cnt(t, at, gc + 1)
  def cnt([?C | t], at, gc), do: cnt(t, at, gc + 1)
  # A `>` ends the scan: whatever follows it on the line is not counted.
  def cnt([?> | _], at, gc), do: {at, gc}
  def cnt([], at, gc), do: {at, gc}
  # Any other character is skipped and the scan continues. There is
  # deliberately no shortcut for "nothing counted yet": a line such as
  # "NNACGT" must still have its trailing bases counted.
  def cnt([_ | t], at, gc), do: cnt(t, at, gc)
end
defmodule GCCount do
  @moduledoc """
  Computes the GC content of a FASTA file by summing the per-line counts
  returned by `ATGCCount.count/1`.
  """

  @default_filename "chry.fa"

  @doc """
  Reads `filename` line by line and returns the accumulated `{at, gc}`
  counts, or `nil` when the file does not exist.

  `filename` defaults to `"chry.fa"`.
  """
  @spec process(Path.t()) :: {non_neg_integer(), non_neg_integer()} | nil
  def process(filename \\ @default_filename) do
    if File.exists?(filename) do
      filename
      # Two-argument form: `:line` is the default chunking, and this avoids
      # the three-argument order that newer Elixir releases deprecate.
      |> File.stream!([:read_ahead, :raw])
      |> Enum.reduce({0, 0}, fn line, {at_acc, gc_acc} ->
        {at, gc} = ATGCCount.count(line)
        {at_acc + at, gc_acc + gc}
      end)
    else
      nil
    end
  end

  @doc """
  Returns the GC ratio of the default file (see `process/1`).

  The result is `0` when the file is missing or contains no counted bases.
  """
  @spec gc_ratio() :: number()
  def gc_ratio, do: gc_ratio(process())

  @doc """
  Returns `gc / (gc + at)` for an `{at, gc}` tuple as produced by
  `process/1`.

  Returns the integer `0` for `nil` (no file) and for `{0, 0}` (nothing
  counted), which also avoids a division by zero.

  ## Examples

      iex> GCCount.gc_ratio({1, 3})
      0.75
  """
  @spec gc_ratio({non_neg_integer(), non_neg_integer()} | nil) :: number()
  def gc_ratio(nil), do: 0
  def gc_ratio({0, 0}), do: 0
  # The tuple is {at, gc}, in that order; the GC count is the numerator.
  def gc_ratio({at, gc}), do: gc / (gc + at)
end
This comment has been minimized.
This comment has been minimized.
@behe Cool! In my experiment on saml.rilspace.org/moar-languagez-gc-content-in-python-d-fpc-c-and-c I put on the restriction to only read one line at a time (except for the second test in the end). Would be very interesting to see what performance we get without that optimization! |
This comment has been minimized.
This comment has been minimized.
All: For reference, the data file used here is available at: http://bit.ly/ychromo |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
I got this faster primarily by changing to reading bigger chunks of the file at a time, eg:
File.stream!(filename, [:read_ahead, :raw], @read_size)
The second, smaller speed-up (only observable when reading bigger chunks) comes from skipping the list conversion and pattern matching on the binary directly, e.g. `def cnt(<<?A, rest::binary>>, at, gc), do: cnt(rest, at + 1, gc)`.
Thirdly I can cut down the total time by doing this asynchronously, see: https://gist.github.com/behe/b741c6ae7ff567b2d307
Lastly this seems wrong:
def cnt(_,0,0), do: {0,0}
I believe this is not needed and will incorrectly return a line starting with N as a {0,0} result even though it might include ATGC afterwards.
And then of course in the
gc_ratio
you have switched the order of at & gc by mistake. Thanks for a fun Elixir exercise!