Skip to content

Instantly share code, notes, and snippets.

@oseiskar
Created November 18, 2015 19:22
Show Gist options
  • Save oseiskar/e4b4de21e6471a52cda1 to your computer and use it in GitHub Desktop.
Save oseiskar/e4b4de21e6471a52cda1 to your computer and use it in GitHub Desktop.
%% GNU Octave
% Computes a maximum a posteriori (MAP) estimate X for the hidden states
% for the given DNA sequence Q under the pre-defined hidden Markov model
% Depending on what the mode parameter is, this computes
%
% 'any-sequence' (default)
%
% X = (x_1, ..., x_n) \in argmax_{x_1, ..., x_n} P(Q|x_1,...,x_n)
%
% 'all-sequences' (there can be many MAP sequences)
%
% X = argmax_{x_1, ... , x_n} P(QY|x_1,...x_n)
%
% 'last-state' (not the same as last element of any MAP sequence)
%
% X = argmax_{x_n} P(Q|x_n)
%
function [MAP_estimate, prob] = viterbi(dna_sequence, mode = 'any-sequence')
% define model
model = struct();
% Names of the hidden states
model.state_names = 'HL';
% Hidden markov model transition probability matrix
model.T = [0.5, 0.4;
0.5, 0.6];
% Names/codes for the observations (the nucleotide names)
model.observation_codes = 'ACGT';
% Observation probability matrix
model.O = [0.2, 0.3;
0.3, 0.2;
0.3, 0.2;
0.2, 0.3];
% Initial probabilities for the hidden states
model.p0 = [0.5; 0.5];
% Choose what to compute
switch mode
case 'any-sequence'
MAP_estimate = viterbi_MAP_state_sequence(model, dna_sequence);
case 'all-sequences'
MAP_estimate = viterbi_all_MAP_state_sequences(model, dna_sequence);
case 'last-state'
[MAP_estimate, prob] = bayes_smoother_last_state(model, dna_sequence);
otherwise
error error;
endswitch
endfunction
% Helper: decode observation codes to indices, e.g., GATTACA -> [3,1,4,4,1,2,1]
% this is a O(h*n) operation
function indices = decode_sequence(model, s)
indices = zeros(size(s));
for ii=1:numel(s)
indices(ii) = find(model.observation_codes == s(ii));
end
endfunction
% Find a MAP sequence using the Viterbi algorithm: O(n*h^2)
function MAP_sequence = viterbi_MAP_state_sequence(model, q)
n = numel(q);
h = numel(model.state_names);
% Convert codes to observation matrix row indices: ACGT -> 1234
q_idx = decode_sequence(model, q);
% Viterbi algorithm / dynamic programming
% Use probability logarithms to avoid underflow
E = zeros(h, n);
log_P = log(model.p0) + log(model.O(q_idx(1),:)).';
for ii=2:n,
log_P_prev = log_P;
for jj=1:h
probs = log(model.O(q_idx(ii),jj)) + log(model.T(jj,:)).' + log_P_prev;
[log_P(jj), E(jj,ii)] = max(probs);
end
end
[max_val, edge] = max(log_P);
% initialize as a vector of same length and type as q
MAP_sequence = q;
% Backtrack to construct a maximizing sequence.
%
% This is O(n) due to the above preallocation; for vectors of growing size
% it could be O(n^2)
for ii=n:-1:1,
MAP_sequence(ii) = model.state_names(edge);
if ii > 0,
edge = E(edge,ii);
end
end
endfunction
% Find the most probable last state(s) using the Viterbi algorithm: O(n*h^2)
function [MAP_last_state, prob] = bayes_smoother_last_state(model, q)
n = numel(q);
h = numel(model.state_names);
q_idx = decode_sequence(model, q);
P = model.p0 .* model.O(q_idx(1),:).';
P = P/sum(P);
for ii=2:n,
P = model.O(q_idx(ii),:).' .* (model.T * P);
% normalize probabilities to avoid underflow
probs = P/sum(P);
end
[prob, index] = max(probs);
MAP_last_state = model.state_names(index);
endfunction
% Enumerate ALL MAP sequences. Constructing a graph defining all the possible
% sequences is O(n*h^2) but enumerating them is O(h^n) in the worst case,
% since, in theory, there can be that many MAP sequences. They could be
% counted in O(n*h^2) using dynamic programming, though
function MAP_sequences = viterbi_all_MAP_state_sequences(model, q)
n = numel(q);
h = numel(model.state_names);
q_idx = decode_sequence(model, q);
% Relative threshold for checking if two probabilities encountered in the
% Viterbi algorithm are the same
EPSILON = 1e-10;
log_P = log(model.p0) + log(model.O(q_idx(1),:)).';
E = zeros(h, h, n);
for ii=2:n,
log_P_prev = log_P;
for jj=1:h
probs = log(model.O(q_idx(ii),jj)) + log(model.T(jj,:)).' + log_P_prev;
log_P(jj) = max(probs);
E(jj,:,ii) = abs(probs-log_P(jj)) < log(1+EPSILON);
end
end
% Enumerate all MAP sequeces by starting to build them from the end
max_indices = find(abs(log_P-max(log_P)) < log(1+EPSILON));
sequences = num2cell(max_indices);
for ii=n:-1:2,
new_sequences = {};
for jj=1:numel(sequences),
s = sequences{jj};
for node=find(E(s(1),:,ii))
new_sequences{end+1} = [node, s];
end
end
sequences = new_sequences;
end
MAP_sequences = {};
for ii=1:numel(sequences)
MAP_sequences{end+1} = model.state_names(sequences{ii});
end
endfunction
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment