Skip to content

Instantly share code, notes, and snippets.

@mlin mlin/phylo.jl Secret
Last active Dec 17, 2015

Embed
What would you like to do?
A taste of molecular phylogenetics in Julia
module Code
export T, dna, aa, codon61, codon64
# a code is defined by an alphabet; indices is a reverse lookup from
# character to array index
type T
alphabet::Array{String,1}
indices::Dict{String,Integer}
T(chs) = begin
idxs = (String=>Integer)[chs[i] => i for i = 1:Base.length(chs)]
assert(length(idxs) == length(chs))
new(chs,idxs)
end
end
# size of the code
import Base.length
length(c::T) = length(c.alphabet)
# DNA
dna = T(["A","G","C","T"])
# amino acid sequences
aa = T(["A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V"])
# compute codon64
function to_codon(i)
n3 = i%4
n2 = div(i,4)%4
n1 = div(i,16)
dna.alphabet[n1+1]*dna.alphabet[n2+1]*dna.alphabet[n3+1]
end
codon64 = T([to_codon(i) for i in 0:63])
# filter codon64 to get codon61
not_stop(x) = !has(Set("TAA","TAG","TGA"),x)
codon61 = T(filter(not_stop,codon64.alphabet))
end
###################################################
module Tree
export T, size, leaves
# A tree with k leaves is defined by an array p of length l, l ∈
# [k,2k-2]. p[i] is the index of the parent of node i. p[1:k] are the
# leaves, and i < p[i]. The total number of nodes is n=l+1, and p[i] ∈
# [1,n].
type T
k::Integer
parents::Array{Integer,1}
children::Array{Set{Integer},1} # inverts parents
branch_lengths::Array{Float64,1}
T(k,pts) = new(k,pts,children_from_parents(k,pts),[])
T(k,pts,bls) = begin
assert(length(bls) == length(pts))
assert(all((t) -> isfinite(isfinite(t) && t>=0), bls))
new(k,pts,children_from_parents(k,pts),bls)
end
end
leaves(t::T) = t.k
size(t::T) = length(t.parents)+1
function children_from_parents(k,pts)
assert(length(pts) >= k)
n = length(pts)+1
children = [Set{Integer}() for i in 1:n]
for i in 1:length(pts)
assert(pts[i] > i && pts[i] <= n && (i>k||pts[i]>k))
add!(children[pts[i]], i)
end
children
end
end
###################################################
module RateMatrix
export T, to_p
type T
matrix::Array{Float64,2}
evals::Array{Float64,1}
evecs::Array{Float64,2}
inv_evecs::Array{Float64,2}
pi::Array{Float64,1}
T(m) = begin
vals, vecs = eig(m)
ivecs = inv(vecs)
raw_pi = reshape(ivecs[indmin(abs(vals)),:], size(m,1))
new(m,vals,vecs,ivecs,raw_pi/sum(raw_pi))
end
end
to_p(q,t) = q.evecs*diagm(exp(q.evals*t))*q.inv_evecs
end
###################################################
module PhyloModel
export T
using Tree, RateMatrix
type T
tree::Tree.T
q_matrix::RateMatrix.T
p_matrices::Array{Array{Float64,2},1}
T(t,q) = begin
ps = [RateMatrix.to_p(q,x) for x in t.branch_lengths]
new(t,q,ps)
end
end
import Base.show
show(io::IO, m::T) = begin
print(io, "tree = " )
Base.show(io, m.tree)
print(io, "\nq = ")
Base.show(io, m.q_matrix.matrix)
print(io, "\npi = ")
Base.show(io, m.q_matrix.pi)
end
end
###################################################
s = [
0 .2 .1 .1;
.2 0 .1 .1;
.1 .1 0 .2;
.1 .1 .2 0
]
pi = [0.2 0.3 0.3 0.2]
raw_q = (Float64)[s[i,j]*pi[j] for i in 1:length(Code.dna), j in 1:length(Code.dna)]
for i in 1:length(Code.dna)
raw_q[i,i] = 0-sum(raw_q[i,:])
end
t = Tree.T(3,[5,4,4,5],[1.5,0.5,0.5,1.0])
q = RateMatrix.T([
-0.11 0.06 0.03 0.02;
0.04 -0.09 0.03 0.02;
0.02 0.03 -0.09 0.04;
0.02 0.03 0.06 -0.11;
])
model = PhyloModel.T(t,q)
# Given nonnegative weights [w_1 w_2, ..., w_n], choose an index from
# [1,n] with probability proportional to the weights
function random_choice(weights)
assert(all((x) -> isfinite(x) && x >= 0, weights))
# choose random x in [0,sum(weights)]
x = rand()*sum(weights)
# find the smallest i s.t. sum(weights[1:i]) >= x
for i in 1:(length(weights)-1)
x -= weights[i]
if x <= 0
return i
end
end
return length(weights)
end
function sim_site(code,model)
n = Tree.size(model.tree)
chs = [-1 for i in 1:n]
# sample the root state from the equilibrium distribution
chs[n] = random_choice(model.q_matrix.pi)
# work down the tree, sampling the state at each node by applying
# the random substitution process to the previously-sampled state of
# is parent
for i in (n-1):-1:1
pt_ch = chs[model.tree.parents[i]]
chs[i] = random_choice(model.p_matrices[i][pt_ch,:])
end
# take the leaves and convert the state indices to the corresponding
# strings
return map((ch) -> code.alphabet[ch],
chs[1:Tree.leaves(model.tree)])
end
###################################################
# logsumexp: given x = [log(a_1), log(a_2), ..., log(a_n)]
# compute log(a_1 + a_2 + ... + a_n)
function logsumexp(x)
mx = max(x)
mx+log(sum(exp(x-mx)))
end
function felsenstein(code::Code.T, model::PhyloModel.T, site)
# initialize the alpha matrix with log(0)
a = Array(Float64, Tree.size(model.tree), length(code))
fill!(a,-Inf)
# initialize the rows corresponding to the leaves with log(1) in the
# column corresponding to the observed letter
for i in 1:Tree.leaves(model.tree)
a[i,code.indices[site[i]]] = 0
end
# fill the rows corresponding to the nodes
for pt in (Tree.leaves(model.tree)+1):Tree.size(model.tree)
# for each possible letter at node pt...
for x in 1:length(code)
# for each child of node pt...
for ch in model.tree.children[pt]
# include the likelihood contribution of the branch from pt to
# ch, conditional on x in pt
lk = logsumexp(log(model.p_matrices[ch][x,:]) + a[ch,:])
a[pt,x] = (isinf(a[pt,x]) ? 0 : a[pt,x]) + lk
end
end
end
# return the now-filled-in alpha matrix
a
end
function site_likelihood(code::Code.T, model::PhyloModel.T, site)
a = felsenstein(code,model,site)
a_root = reshape(a[Tree.size(model.tree),:],length(code))
logsumexp(log(model.q_matrix.pi) + a_root)
end
###################################################
println(model)
println(felsenstein(Code.dna, model, ["A", "G", "G"]))
println(site_likelihood(Code.dna, model, ["A", "G", "G"]))
n = 1000000
k = 0
s = ["A","G","G"]
for i in 1:n
if sim_site(Code.dna, model) == s
k += 1
end
end
println(100*k/n)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.