oseiskar/viterbi.m

## viterbi.m
%% GNU Octave

% Computes a maximum a posteriori (MAP) estimate X for the hidden states
% for the given DNA sequence Q under the pre-defined hidden Markov model
% Depending on what the mode parameter is, this computes
%
%   'any-sequence' (default)
%
%       X = (x_1, ..., x_n) \in argmax_{x_1, ..., x_n} P(Q|x_1,...,x_n)
%
%   'all-sequences' (there can be many MAP sequences)
%
%       X = argmax_{x_1, ... , x_n} P(QY|x_1,...x_n)
%
%   'last-state' (not the same as last element of any MAP sequence)
%
%       X = argmax_{x_n} P(Q|x_n)
%
function [MAP_estimate, prob] = viterbi(dna_sequence, mode = 'any-sequence')

    % define model
    model = struct();

    % Names of the hidden states
    model.state_names = 'HL';

    % Hidden markov model transition probability matrix
    model.T = [0.5, 0.4;
               0.5, 0.6];

    % Names/codes for the observations (the nucleotide names)
    model.observation_codes = 'ACGT';

    % Observation probability matrix
    model.O = [0.2, 0.3;
               0.3, 0.2;
               0.3, 0.2;
               0.2, 0.3];

    % Initial probabilities for the hidden states
    model.p0 = [0.5; 0.5];

    % Choose what to compute
    switch mode
    case 'any-sequence'
        MAP_estimate = viterbi_MAP_state_sequence(model, dna_sequence);

    case 'all-sequences'
        MAP_estimate = viterbi_all_MAP_state_sequences(model, dna_sequence);

    case 'last-state'
        [MAP_estimate, prob] = bayes_smoother_last_state(model, dna_sequence);

    otherwise
        error error;

    endswitch

endfunction

% Helper: decode observation codes to indices, e.g., GATTACA -> [3,1,4,4,1,2,1]
% this is a O(h*n) operation
function indices = decode_sequence(model, s)
    indices = zeros(size(s));
    for ii=1:numel(s)
        indices(ii) = find(model.observation_codes == s(ii));
    end
endfunction

% Find a MAP sequence using the Viterbi algorithm: O(n*h^2)
function MAP_sequence = viterbi_MAP_state_sequence(model, q)

    n = numel(q);
    h = numel(model.state_names);

    % Convert codes to observation matrix row indices: ACGT -> 1234
    q_idx = decode_sequence(model, q);

    % Viterbi algorithm / dynamic programming
    % Use probability logarithms to avoid underflow
    E = zeros(h, n);
    log_P = log(model.p0) + log(model.O(q_idx(1),:)).';
    for ii=2:n,
        log_P_prev = log_P;
        for jj=1:h
            probs = log(model.O(q_idx(ii),jj)) + log(model.T(jj,:)).' + log_P_prev;
            [log_P(jj), E(jj,ii)] = max(probs);
        end
    end

    [max_val, edge] = max(log_P);

    % initialize as a vector of same length and type as q
    MAP_sequence = q;

    % Backtrack to construct a maximizing sequence.
    %
    % This is O(n) due to the above preallocation; for vectors of growing size
    % it could be O(n^2)
    for ii=n:-1:1,
        MAP_sequence(ii) = model.state_names(edge);
        if ii > 0,
            edge = E(edge,ii);
        end
    end
endfunction

% Find the most probable last state(s) using the Viterbi algorithm: O(n*h^2)
function [MAP_last_state, prob] = bayes_smoother_last_state(model, q)

    n = numel(q);
    h = numel(model.state_names);

    q_idx = decode_sequence(model, q);

    P = model.p0 .* model.O(q_idx(1),:).';
    P = P/sum(P);
    for ii=2:n,
        P = model.O(q_idx(ii),:).' .* (model.T * P);
        % normalize probabilities to avoid underflow
        probs = P/sum(P);
    end

    [prob, index] = max(probs);
    MAP_last_state = model.state_names(index);
endfunction

% Enumerate ALL MAP sequences. Constructing a graph defining all the possible
% sequences is O(n*h^2) but enumerating them is O(h^n) in the worst case,
% since, in theory, there can be that many MAP sequences. They could be
% counted in O(n*h^2) using dynamic programming, though
function MAP_sequences = viterbi_all_MAP_state_sequences(model, q)

    n = numel(q);
    h = numel(model.state_names);

    q_idx = decode_sequence(model, q);

    % Relative threshold for checking if two probabilities encountered in the
    % Viterbi algorithm are the same
    EPSILON = 1e-10;

    log_P = log(model.p0) + log(model.O(q_idx(1),:)).';
    E = zeros(h, h, n);
    for ii=2:n,
        log_P_prev = log_P;
        for jj=1:h
            probs = log(model.O(q_idx(ii),jj)) + log(model.T(jj,:)).' + log_P_prev;
            log_P(jj) = max(probs);

            E(jj,:,ii) = abs(probs-log_P(jj)) < log(1+EPSILON);
        end
    end

    % Enumerate all MAP sequeces by starting to build them from the end
    max_indices = find(abs(log_P-max(log_P)) < log(1+EPSILON));
    sequences = num2cell(max_indices);

    for ii=n:-1:2,
        new_sequences = {};
        for jj=1:numel(sequences),
            s = sequences{jj};
            for node=find(E(s(1),:,ii))
                new_sequences{end+1} = [node, s];
            end
        end
        sequences = new_sequences;
    end

    MAP_sequences = {};
    for ii=1:numel(sequences)
        MAP_sequences{end+1} = model.state_names(sequences{ii});
    end
endfunction
	%% GNU Octave

	% Computes a maximum a posteriori (MAP) estimate X for the hidden states
	% for the given DNA sequence Q under the pre-defined hidden Markov model
	% Depending on what the mode parameter is, this computes
	%
	% 'any-sequence' (default)
	%
	% X = (x_1, ..., x_n) \in argmax_{x_1, ..., x_n} P(Q\|x_1,...,x_n)
	%
	% 'all-sequences' (there can be many MAP sequences)
	%
	% X = argmax_{x_1, ... , x_n} P(QY\|x_1,...x_n)
	%
	% 'last-state' (not the same as last element of any MAP sequence)
	%
	% X = argmax_{x_n} P(Q\|x_n)
	%
	function [MAP_estimate, prob] = viterbi(dna_sequence, mode = 'any-sequence')

	% define model
	model = struct();

	% Names of the hidden states
	model.state_names = 'HL';

	% Hidden markov model transition probability matrix
	model.T = [0.5, 0.4;
	0.5, 0.6];

	% Names/codes for the observations (the nucleotide names)
	model.observation_codes = 'ACGT';

	% Observation probability matrix
	model.O = [0.2, 0.3;
	0.3, 0.2;
	0.3, 0.2;
	0.2, 0.3];

	% Initial probabilities for the hidden states
	model.p0 = [0.5; 0.5];

	% Choose what to compute
	switch mode
	case 'any-sequence'
	MAP_estimate = viterbi_MAP_state_sequence(model, dna_sequence);

	case 'all-sequences'
	MAP_estimate = viterbi_all_MAP_state_sequences(model, dna_sequence);

	case 'last-state'
	[MAP_estimate, prob] = bayes_smoother_last_state(model, dna_sequence);

	otherwise
	error error;

	endswitch

	endfunction

	% Helper: decode observation codes to indices, e.g., GATTACA -> [3,1,4,4,1,2,1]
	% this is a O(h*n) operation
	function indices = decode_sequence(model, s)
	indices = zeros(size(s));
	for ii=1:numel(s)
	indices(ii) = find(model.observation_codes == s(ii));
	end
	endfunction

	% Find a MAP sequence using the Viterbi algorithm: O(n*h^2)
	function MAP_sequence = viterbi_MAP_state_sequence(model, q)

	n = numel(q);
	h = numel(model.state_names);

	% Convert codes to observation matrix row indices: ACGT -> 1234
	q_idx = decode_sequence(model, q);

	% Viterbi algorithm / dynamic programming
	% Use probability logarithms to avoid underflow
	E = zeros(h, n);
	log_P = log(model.p0) + log(model.O(q_idx(1),:)).';
	for ii=2:n,
	log_P_prev = log_P;
	for jj=1:h
	probs = log(model.O(q_idx(ii),jj)) + log(model.T(jj,:)).' + log_P_prev;
	[log_P(jj), E(jj,ii)] = max(probs);
	end
	end

	[max_val, edge] = max(log_P);

	% initialize as a vector of same length and type as q
	MAP_sequence = q;

	% Backtrack to construct a maximizing sequence.
	%
	% This is O(n) due to the above preallocation; for vectors of growing size
	% it could be O(n^2)
	for ii=n:-1:1,
	MAP_sequence(ii) = model.state_names(edge);
	if ii > 0,
	edge = E(edge,ii);
	end
	end
	endfunction

	% Find the most probable last state(s) using the Viterbi algorithm: O(n*h^2)
	function [MAP_last_state, prob] = bayes_smoother_last_state(model, q)

	n = numel(q);
	h = numel(model.state_names);

	q_idx = decode_sequence(model, q);

	P = model.p0 .* model.O(q_idx(1),:).';
	P = P/sum(P);
	for ii=2:n,
	P = model.O(q_idx(ii),:).' .* (model.T * P);
	% normalize probabilities to avoid underflow
	probs = P/sum(P);
	end

	[prob, index] = max(probs);
	MAP_last_state = model.state_names(index);
	endfunction

	% Enumerate ALL MAP sequences. Constructing a graph defining all the possible
	% sequences is O(n*h^2) but enumerating them is O(h^n) in the worst case,
	% since, in theory, there can be that many MAP sequences. They could be
	% counted in O(n*h^2) using dynamic programming, though
	function MAP_sequences = viterbi_all_MAP_state_sequences(model, q)

	n = numel(q);
	h = numel(model.state_names);

	q_idx = decode_sequence(model, q);

	% Relative threshold for checking if two probabilities encountered in the
	% Viterbi algorithm are the same
	EPSILON = 1e-10;

	log_P = log(model.p0) + log(model.O(q_idx(1),:)).';
	E = zeros(h, h, n);
	for ii=2:n,
	log_P_prev = log_P;
	for jj=1:h
	probs = log(model.O(q_idx(ii),jj)) + log(model.T(jj,:)).' + log_P_prev;
	log_P(jj) = max(probs);

	E(jj,:,ii) = abs(probs-log_P(jj)) < log(1+EPSILON);
	end
	end

	% Enumerate all MAP sequeces by starting to build them from the end
	max_indices = find(abs(log_P-max(log_P)) < log(1+EPSILON));
	sequences = num2cell(max_indices);

	for ii=n:-1:2,
	new_sequences = {};
	for jj=1:numel(sequences),
	s = sequences{jj};
	for node=find(E(s(1),:,ii))
	new_sequences{end+1} = [node, s];
	end
	end
	sequences = new_sequences;
	end

	MAP_sequences = {};
	for ii=1:numel(sequences)
	MAP_sequences{end+1} = model.state_names(sequences{ii});
	end
	endfunction