-
-
Save mlin/574ffeffa7eb0db3e710 to your computer and use it in GitHub Desktop.
A taste of molecular phylogenetics in Julia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Code | |
export T, dna, aa, codon61, codon64 | |
# a code is defined by an alphabet; indices is a reverse lookup from | |
# character to array index | |
type T | |
alphabet::Array{String,1} | |
indices::Dict{String,Integer} | |
T(chs) = begin | |
idxs = (String=>Integer)[chs[i] => i for i = 1:Base.length(chs)] | |
assert(length(idxs) == length(chs)) | |
new(chs,idxs) | |
end | |
end | |
# size of the code | |
import Base.length | |
length(c::T) = length(c.alphabet) | |
# DNA | |
dna = T(["A","G","C","T"]) | |
# amino acid sequences | |
aa = T(["A","R","N","D","C","Q","E","G","H","I", | |
"L","K","M","F","P","S","T","W","Y","V"]) | |
# compute codon64 | |
function to_codon(i) | |
n3 = i%4 | |
n2 = div(i,4)%4 | |
n1 = div(i,16) | |
dna.alphabet[n1+1]*dna.alphabet[n2+1]*dna.alphabet[n3+1] | |
end | |
codon64 = T([to_codon(i) for i in 0:63]) | |
# filter codon64 to get codon61 | |
not_stop(x) = !has(Set("TAA","TAG","TGA"),x) | |
codon61 = T(filter(not_stop,codon64.alphabet)) | |
end | |
################################################### | |
module Tree | |
export T, size, leaves | |
# A tree with k leaves is defined by an array p of length l, l ∈ | |
# [k,2k-2]. p[i] is the index of the parent of node i. p[1:k] are the | |
# leaves, and i < p[i]. The total number of nodes is n=l+1, and p[i] ∈ | |
# [1,n]. | |
type T | |
k::Integer | |
parents::Array{Integer,1} | |
children::Array{Set{Integer},1} # inverts parents | |
branch_lengths::Array{Float64,1} | |
T(k,pts) = new(k,pts,children_from_parents(k,pts),[]) | |
T(k,pts,bls) = begin | |
assert(length(bls) == length(pts)) | |
assert(all((t) -> isfinite(isfinite(t) && t>=0), bls)) | |
new(k,pts,children_from_parents(k,pts),bls) | |
end | |
end | |
leaves(t::T) = t.k | |
size(t::T) = length(t.parents)+1 | |
function children_from_parents(k,pts) | |
assert(length(pts) >= k) | |
n = length(pts)+1 | |
children = [Set{Integer}() for i in 1:n] | |
for i in 1:length(pts) | |
assert(pts[i] > i && pts[i] <= n && (i>k||pts[i]>k)) | |
add!(children[pts[i]], i) | |
end | |
children | |
end | |
end | |
################################################### | |
module RateMatrix | |
export T, to_p | |
type T | |
matrix::Array{Float64,2} | |
evals::Array{Float64,1} | |
evecs::Array{Float64,2} | |
inv_evecs::Array{Float64,2} | |
pi::Array{Float64,1} | |
T(m) = begin | |
vals, vecs = eig(m) | |
ivecs = inv(vecs) | |
raw_pi = reshape(ivecs[indmin(abs(vals)),:], size(m,1)) | |
new(m,vals,vecs,ivecs,raw_pi/sum(raw_pi)) | |
end | |
end | |
to_p(q,t) = q.evecs*diagm(exp(q.evals*t))*q.inv_evecs | |
end | |
################################################### | |
module PhyloModel | |
export T | |
using Tree, RateMatrix | |
type T | |
tree::Tree.T | |
q_matrix::RateMatrix.T | |
p_matrices::Array{Array{Float64,2},1} | |
T(t,q) = begin | |
ps = [RateMatrix.to_p(q,x) for x in t.branch_lengths] | |
new(t,q,ps) | |
end | |
end | |
import Base.show | |
show(io::IO, m::T) = begin | |
print(io, "tree = " ) | |
Base.show(io, m.tree) | |
print(io, "\nq = ") | |
Base.show(io, m.q_matrix.matrix) | |
print(io, "\npi = ") | |
Base.show(io, m.q_matrix.pi) | |
end | |
end | |
################################################### | |
s = [ | |
0 .2 .1 .1; | |
.2 0 .1 .1; | |
.1 .1 0 .2; | |
.1 .1 .2 0 | |
] | |
pi = [0.2 0.3 0.3 0.2] | |
raw_q = (Float64)[s[i,j]*pi[j] for i in 1:length(Code.dna), j in 1:length(Code.dna)] | |
for i in 1:length(Code.dna) | |
raw_q[i,i] = 0-sum(raw_q[i,:]) | |
end | |
t = Tree.T(3,[5,4,4,5],[1.5,0.5,0.5,1.0]) | |
q = RateMatrix.T([ | |
-0.11 0.06 0.03 0.02; | |
0.04 -0.09 0.03 0.02; | |
0.02 0.03 -0.09 0.04; | |
0.02 0.03 0.06 -0.11; | |
]) | |
model = PhyloModel.T(t,q) | |
# Given nonnegative weights [w_1 w_2, ..., w_n], choose an index from | |
# [1,n] with probability proportional to the weights | |
function random_choice(weights) | |
assert(all((x) -> isfinite(x) && x >= 0, weights)) | |
# choose random x in [0,sum(weights)] | |
x = rand()*sum(weights) | |
# find the smallest i s.t. sum(weights[1:i]) >= x | |
for i in 1:(length(weights)-1) | |
x -= weights[i] | |
if x <= 0 | |
return i | |
end | |
end | |
return length(weights) | |
end | |
function sim_site(code,model) | |
n = Tree.size(model.tree) | |
chs = [-1 for i in 1:n] | |
# sample the root state from the equilibrium distribution | |
chs[n] = random_choice(model.q_matrix.pi) | |
# work down the tree, sampling the state at each node by applying | |
# the random substitution process to the previously-sampled state of | |
# is parent | |
for i in (n-1):-1:1 | |
pt_ch = chs[model.tree.parents[i]] | |
chs[i] = random_choice(model.p_matrices[i][pt_ch,:]) | |
end | |
# take the leaves and convert the state indices to the corresponding | |
# strings | |
return map((ch) -> code.alphabet[ch], | |
chs[1:Tree.leaves(model.tree)]) | |
end | |
################################################### | |
# logsumexp: given x = [log(a_1), log(a_2), ..., log(a_n)] | |
# compute log(a_1 + a_2 + ... + a_n) | |
function logsumexp(x) | |
mx = max(x) | |
mx+log(sum(exp(x-mx))) | |
end | |
function felsenstein(code::Code.T, model::PhyloModel.T, site) | |
# initialize the alpha matrix with log(0) | |
a = Array(Float64, Tree.size(model.tree), length(code)) | |
fill!(a,-Inf) | |
# initialize the rows corresponding to the leaves with log(1) in the | |
# column corresponding to the observed letter | |
for i in 1:Tree.leaves(model.tree) | |
a[i,code.indices[site[i]]] = 0 | |
end | |
# fill the rows corresponding to the nodes | |
for pt in (Tree.leaves(model.tree)+1):Tree.size(model.tree) | |
# for each possible letter at node pt... | |
for x in 1:length(code) | |
# for each child of node pt... | |
for ch in model.tree.children[pt] | |
# include the likelihood contribution of the branch from pt to | |
# ch, conditional on x in pt | |
lk = logsumexp(log(model.p_matrices[ch][x,:]) + a[ch,:]) | |
a[pt,x] = (isinf(a[pt,x]) ? 0 : a[pt,x]) + lk | |
end | |
end | |
end | |
# return the now-filled-in alpha matrix | |
a | |
end | |
function site_likelihood(code::Code.T, model::PhyloModel.T, site) | |
a = felsenstein(code,model,site) | |
a_root = reshape(a[Tree.size(model.tree),:],length(code)) | |
logsumexp(log(model.q_matrix.pi) + a_root) | |
end | |
################################################### | |
println(model) | |
println(felsenstein(Code.dna, model, ["A", "G", "G"])) | |
println(site_likelihood(Code.dna, model, ["A", "G", "G"])) | |
n = 1000000 | |
k = 0 | |
s = ["A","G","G"] | |
for i in 1:n | |
if sim_site(Code.dna, model) == s | |
k += 1 | |
end | |
end | |
println(100*k/n) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment