Created
November 18, 2015 19:22
-
-
Save oseiskar/e4b4de21e6471a52cda1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% GNU Octave | |
% Computes a maximum a posteriori (MAP) estimate X for the hidden states | |
% for the given DNA sequence Q under the pre-defined hidden Markov model | |
% Depending on what the mode parameter is, this computes | |
% | |
% 'any-sequence' (default) | |
% | |
% X = (x_1, ..., x_n) \in argmax_{x_1, ..., x_n} P(Q|x_1,...,x_n) | |
% | |
% 'all-sequences' (there can be many MAP sequences) | |
% | |
% X = argmax_{x_1, ... , x_n} P(QY|x_1,...x_n) | |
% | |
% 'last-state' (not the same as last element of any MAP sequence) | |
% | |
% X = argmax_{x_n} P(Q|x_n) | |
% | |
function [MAP_estimate, prob] = viterbi(dna_sequence, mode = 'any-sequence') | |
% define model | |
model = struct(); | |
% Names of the hidden states | |
model.state_names = 'HL'; | |
% Hidden markov model transition probability matrix | |
model.T = [0.5, 0.4; | |
0.5, 0.6]; | |
% Names/codes for the observations (the nucleotide names) | |
model.observation_codes = 'ACGT'; | |
% Observation probability matrix | |
model.O = [0.2, 0.3; | |
0.3, 0.2; | |
0.3, 0.2; | |
0.2, 0.3]; | |
% Initial probabilities for the hidden states | |
model.p0 = [0.5; 0.5]; | |
% Choose what to compute | |
switch mode | |
case 'any-sequence' | |
MAP_estimate = viterbi_MAP_state_sequence(model, dna_sequence); | |
case 'all-sequences' | |
MAP_estimate = viterbi_all_MAP_state_sequences(model, dna_sequence); | |
case 'last-state' | |
[MAP_estimate, prob] = bayes_smoother_last_state(model, dna_sequence); | |
otherwise | |
error error; | |
endswitch | |
endfunction | |
% Helper: decode observation codes to indices, e.g., GATTACA -> [3,1,4,4,1,2,1] | |
% this is a O(h*n) operation | |
function indices = decode_sequence(model, s) | |
indices = zeros(size(s)); | |
for ii=1:numel(s) | |
indices(ii) = find(model.observation_codes == s(ii)); | |
end | |
endfunction | |
% Find a MAP sequence using the Viterbi algorithm: O(n*h^2) | |
function MAP_sequence = viterbi_MAP_state_sequence(model, q) | |
n = numel(q); | |
h = numel(model.state_names); | |
% Convert codes to observation matrix row indices: ACGT -> 1234 | |
q_idx = decode_sequence(model, q); | |
% Viterbi algorithm / dynamic programming | |
% Use probability logarithms to avoid underflow | |
E = zeros(h, n); | |
log_P = log(model.p0) + log(model.O(q_idx(1),:)).'; | |
for ii=2:n, | |
log_P_prev = log_P; | |
for jj=1:h | |
probs = log(model.O(q_idx(ii),jj)) + log(model.T(jj,:)).' + log_P_prev; | |
[log_P(jj), E(jj,ii)] = max(probs); | |
end | |
end | |
[max_val, edge] = max(log_P); | |
% initialize as a vector of same length and type as q | |
MAP_sequence = q; | |
% Backtrack to construct a maximizing sequence. | |
% | |
% This is O(n) due to the above preallocation; for vectors of growing size | |
% it could be O(n^2) | |
for ii=n:-1:1, | |
MAP_sequence(ii) = model.state_names(edge); | |
if ii > 0, | |
edge = E(edge,ii); | |
end | |
end | |
endfunction | |
% Find the most probable last state(s) using the Viterbi algorithm: O(n*h^2) | |
function [MAP_last_state, prob] = bayes_smoother_last_state(model, q) | |
n = numel(q); | |
h = numel(model.state_names); | |
q_idx = decode_sequence(model, q); | |
P = model.p0 .* model.O(q_idx(1),:).'; | |
P = P/sum(P); | |
for ii=2:n, | |
P = model.O(q_idx(ii),:).' .* (model.T * P); | |
% normalize probabilities to avoid underflow | |
probs = P/sum(P); | |
end | |
[prob, index] = max(probs); | |
MAP_last_state = model.state_names(index); | |
endfunction | |
% Enumerate ALL MAP sequences. Constructing a graph defining all the possible | |
% sequences is O(n*h^2) but enumerating them is O(h^n) in the worst case, | |
% since, in theory, there can be that many MAP sequences. They could be | |
% counted in O(n*h^2) using dynamic programming, though | |
function MAP_sequences = viterbi_all_MAP_state_sequences(model, q) | |
n = numel(q); | |
h = numel(model.state_names); | |
q_idx = decode_sequence(model, q); | |
% Relative threshold for checking if two probabilities encountered in the | |
% Viterbi algorithm are the same | |
EPSILON = 1e-10; | |
log_P = log(model.p0) + log(model.O(q_idx(1),:)).'; | |
E = zeros(h, h, n); | |
for ii=2:n, | |
log_P_prev = log_P; | |
for jj=1:h | |
probs = log(model.O(q_idx(ii),jj)) + log(model.T(jj,:)).' + log_P_prev; | |
log_P(jj) = max(probs); | |
E(jj,:,ii) = abs(probs-log_P(jj)) < log(1+EPSILON); | |
end | |
end | |
% Enumerate all MAP sequeces by starting to build them from the end | |
max_indices = find(abs(log_P-max(log_P)) < log(1+EPSILON)); | |
sequences = num2cell(max_indices); | |
for ii=n:-1:2, | |
new_sequences = {}; | |
for jj=1:numel(sequences), | |
s = sequences{jj}; | |
for node=find(E(s(1),:,ii)) | |
new_sequences{end+1} = [node, s]; | |
end | |
end | |
sequences = new_sequences; | |
end | |
MAP_sequences = {}; | |
for ii=1:numel(sequences) | |
MAP_sequences{end+1} = model.state_names(sequences{ii}); | |
end | |
endfunction |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment