Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@behe
Forked from samuell/gc_count.exs
Last active November 11, 2016 09:38
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save behe/b741c6ae7ff567b2d307 to your computer and use it in GitHub Desktop.
Save behe/b741c6ae7ff567b2d307 to your computer and use it in GitHub Desktop.
defmodule ATGCCount do
def count(sequence), do: cnt(String.to_char_list(sequence),0,0)
def cnt([65|t],at,gc), do: cnt(t,at+1,gc)
def cnt([84|t],at,gc), do: cnt(t,at+1,gc)
def cnt([71|t],at,gc), do: cnt(t,at,gc+1)
def cnt([67|t],at,gc), do: cnt(t,at,gc+1)
def cnt([62|_],at,gc), do: {at,gc}
def cnt([],at,gc), do: {at,gc}
# def cnt(_,0,0), do: {0,0}
def cnt([_|t], at, gc), do: cnt(t,at,gc)
end
defmodule AsyncATGCCount do
def count do
receive do
{sender, sequence} ->
send sender, { :ok, _count(sequence, 0, 0) }
end
count
end
def _count(<<?A, t :: binary>>, at, gc), do: _count(t, at+1, gc)
def _count(<<?T, t :: binary>>, at, gc), do: _count(t, at+1, gc)
def _count(<<?G, t :: binary>>, at, gc), do: _count(t, at, gc+1)
def _count(<<?C, t :: binary>>, at, gc), do: _count(t, at, gc+1)
def _count(<<?>, _ :: binary>>, at, gc), do: { at, gc }
def _count("", at, gc), do: { at, gc }
def _count(<<_, t :: binary>>, at, gc), do: _count(t, at, gc)
end
defmodule GCCount do
def process(filename) do
if File.exists?(filename) do
map File.stream!(filename, [:read_ahead, :raw], :line)
end
end
def map(stream) do
Enum.reduce stream, {0, 0}, fn(line, {at_acc, gc_acc}) ->
{at, gc} = ATGCCount.count(line)
{at_acc + at, gc_acc + gc}
end
end
def gc_ratio do
{at, gc} = process "chry.fa"
case {at, gc} do
{0, 0} -> 0
{at, gc} -> gc/(at+gc)
end
end
end
defmodule AsyncGCCount do
@read_size 1_00_000
@processes 10
@pids Enum.reduce 1..@processes, [], fn(_, pids) ->
[spawn(AsyncATGCCount, :count, [])] ++ pids
end
def process(filename) do
if File.exists?(filename) do
map File.stream!(filename, [:read_ahead, :raw], @read_size)
end
end
def map(stream) do
Stream.cycle(@pids) |>
Stream.zip(stream) |>
Enum.each fn({pid, line}) ->
send pid, {self(), line}
end
collect {0, 0}
end
def collect {ac,gt} do
receive do
{:ok, atgc} ->
collect {ac + elem(atgc,0), gt + elem(atgc, 1)}
after
10 -> {ac,gt}
end
end
def gc_ratio do
{at, gc} = process "chry.fa"
case {at, gc} do
{0, 0} -> 0
{at, gc} -> gc/(at+gc)
end
end
end
ExUnit.start
defmodule CountTest do
use ExUnit.Case
test "can count AT/GC ratio" do
assert GCCount.map(["ATGCAT"]) == {4, 2}
end
test "ignores N from ratio" do
assert GCCount.map(["AT","GC","NN","AT"]) == {4,2}
end
test "counts ATGC after N" do
assert GCCount.map(["NATGCAT"]) == {4,2}
end
test "ignores > lines from ratio" do
assert GCCount.map(["> ignore me","ATGCAT"]) == {4,2}
end
test "3M sync" do
{time, result} = :timer.tc(GCCount, :process, ["chry3M.fa"])
IO.puts "3M sync #{time/1_000_000}"
assert result == {80115, 44757}
end
test "everything sync" do
{time, result} = :timer.tc(GCCount, :process, ["chry.fa"])
IO.puts "all sync #{time/1_000_000}"
assert result == {5352980, 3228502}
end
end
defmodule AsyncCountTest do
use ExUnit.Case
test "can count AT/GC ratio" do
assert AsyncGCCount.map(["ATGCAT"]) == {4, 2}
end
test "ignores N from ratio" do
assert AsyncGCCount.map(["AT","GC","NN","AT"]) == {4,2}
end
test "counts ATGC after N" do
assert AsyncGCCount.map(["NATGCAT"]) == {4,2}
end
test "ignores > lines from ratio" do
assert AsyncGCCount.map(["> ignore me","ATGCAT"]) == {4,2}
end
test "3M async" do
{time, result} = :timer.tc(AsyncGCCount, :process, ["chry3M.fa"])
IO.puts "3M async #{time/1_000_000}"
assert result == {80115, 44757}
end
test "everything async" do
{time, result} = :timer.tc(AsyncGCCount, :process, ["chry.fa"])
IO.puts "all async #{time/1_000_000}"
assert result == {5352980, 3228502}
end
end
@behe
Copy link
Author

behe commented Oct 3, 2014

The example file chry.fa is from ftp://ftp.ensembl.org/pub/release-67/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa.gz­ and chry3M.fa is a 3Mb chunk of that file created with head -c $((1024*3000)) chry.fa > chry3M.fa

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment