GMM-HMM (hidden Markov model with Gaussian mixture emissions) implementation for speech recognition and other uses
# (C) Kyle Kastner, June 2014
# License: BSD 3 clause
import scipy.stats as st
import numpy as np


class gmmhmm:
    #This class converted with modifications from https://code.google.com/p/hmm-speech-recognition/source/browse/Word.m
    def __init__(self, n_states):
        self.n_states = n_states
        self.random_state = np.random.RandomState(0)

        #Normalize random initial state
        self.prior = self._normalize(self.random_state.rand(self.n_states, 1))
        self.A = self._stochasticize(self.random_state.rand(self.n_states, self.n_states))

        self.mu = None
        self.covs = None
        self.n_dims = None

    def _forward(self, B):
        log_likelihood = 0.
        T = B.shape[1]
        alpha = np.zeros(B.shape)
        for t in range(T):
            if t == 0:
                alpha[:, t] = B[:, t] * self.prior.ravel()
            else:
                alpha[:, t] = B[:, t] * np.dot(self.A.T, alpha[:, t - 1])
            alpha_sum = np.sum(alpha[:, t])
            alpha[:, t] /= alpha_sum
            log_likelihood = log_likelihood + np.log(alpha_sum)
        return log_likelihood, alpha
    def _backward(self, B):
        T = B.shape[1]
        beta = np.zeros(B.shape)
        beta[:, -1] = np.ones(B.shape[0])
        for t in range(T - 1)[::-1]:
            beta[:, t] = np.dot(self.A, (B[:, t + 1] * beta[:, t + 1]))
            beta[:, t] /= np.sum(beta[:, t])
        return beta

    def _state_likelihood(self, obs):
        obs = np.atleast_2d(obs)
        B = np.zeros((self.n_states, obs.shape[1]))
        for s in range(self.n_states):
            #Needs scipy 0.14
            B[s, :] = st.multivariate_normal.pdf(obs.T, mean=self.mu[:, s].T, cov=self.covs[:, :, s].T)
            #This function can (and will!) return values >> 1
            #See the discussion here for the equivalent matlab function
            #https://groups.google.com/forum/#!topic/comp.soft-sys.matlab/YksWK0T74Ak
            #Key line: "Probabilities have to be less than 1,
            #Densities can be anything, even infinite (at individual points)."
            #This is evaluating the density at individual points...
        return B
    def _normalize(self, x):
        return (x + (x == 0)) / np.sum(x)

    def _stochasticize(self, x):
        return (x + (x == 0)) / np.sum(x, axis=1)

    def _em_init(self, obs):
        #Using this _em_init function allows for less required constructor args
        if self.n_dims is None:
            self.n_dims = obs.shape[0]
        if self.mu is None:
            subset = self.random_state.choice(np.arange(self.n_dims), size=self.n_states, replace=False)
            self.mu = obs[:, subset]
        if self.covs is None:
            self.covs = np.zeros((self.n_dims, self.n_dims, self.n_states))
            self.covs += np.diag(np.diag(np.cov(obs)))[:, :, None]
        return self
    def _em_step(self, obs):
        obs = np.atleast_2d(obs)
        B = self._state_likelihood(obs)
        T = obs.shape[1]

        log_likelihood, alpha = self._forward(B)
        beta = self._backward(B)

        xi_sum = np.zeros((self.n_states, self.n_states))
        gamma = np.zeros((self.n_states, T))

        for t in range(T - 1):
            partial_sum = self.A * np.dot(alpha[:, t], (beta[:, t] * B[:, t + 1]).T)
            xi_sum += self._normalize(partial_sum)
            partial_g = alpha[:, t] * beta[:, t]
            gamma[:, t] = self._normalize(partial_g)

        partial_g = alpha[:, -1] * beta[:, -1]
        gamma[:, -1] = self._normalize(partial_g)

        expected_prior = gamma[:, 0]
        expected_A = self._stochasticize(xi_sum)

        expected_mu = np.zeros((self.n_dims, self.n_states))
        expected_covs = np.zeros((self.n_dims, self.n_dims, self.n_states))

        gamma_state_sum = np.sum(gamma, axis=1)
        #Set zeros to 1 before dividing
        gamma_state_sum = gamma_state_sum + (gamma_state_sum == 0)

        for s in range(self.n_states):
            gamma_obs = obs * gamma[s, :]
            expected_mu[:, s] = np.sum(gamma_obs, axis=1) / gamma_state_sum[s]
            partial_covs = np.dot(gamma_obs, obs.T) / gamma_state_sum[s] - np.dot(expected_mu[:, s], expected_mu[:, s].T)
            #Symmetrize
            partial_covs = np.triu(partial_covs) + np.triu(partial_covs).T - np.diag(partial_covs)

        #Ensure positive semidefinite by adding diagonal loading
        expected_covs += .01 * np.eye(self.n_dims)[:, :, None]

        self.prior = expected_prior
        self.mu = expected_mu
        self.covs = expected_covs
        self.A = expected_A

        return log_likelihood
    def fit(self, obs, n_iter=15):
        #Support for 2D and 3D arrays
        #2D should be n_features, n_dims
        #3D should be n_examples, n_features, n_dims
        #For example, with 6 features per speech segment, 105 different words
        #this array should be size
        #(105, 6, X) where X is the number of frames with features extracted
        #For a single example file, the array should be size (6, X)
        if len(obs.shape) == 2:
            for i in range(n_iter):
                self._em_init(obs)
                log_likelihood = self._em_step(obs)
        elif len(obs.shape) == 3:
            count = obs.shape[0]
            for n in range(count):
                for i in range(n_iter):
                    self._em_init(obs[n, :, :])
                    log_likelihood = self._em_step(obs[n, :, :])
        return self

    def transform(self, obs):
        #Support for 2D and 3D arrays
        #2D should be n_features, n_dims
        #3D should be n_examples, n_features, n_dims
        #For example, with 6 features per speech segment, 105 different words
        #this array should be size
        #(105, 6, X) where X is the number of frames with features extracted
        #For a single example file, the array should be size (6, X)
        if len(obs.shape) == 2:
            B = self._state_likelihood(obs)
            log_likelihood, _ = self._forward(B)
            return log_likelihood
        elif len(obs.shape) == 3:
            count = obs.shape[0]
            out = np.zeros((count,))
            for n in range(count):
                B = self._state_likelihood(obs[n, :, :])
                log_likelihood, _ = self._forward(B)
                out[n] = log_likelihood
            return out
if __name__ == "__main__":
    rstate = np.random.RandomState(0)
    t1 = np.ones((4, 40)) + .001 * rstate.rand(4, 40)
    t1 /= t1.sum(axis=0)
    t2 = rstate.rand(*t1.shape)
    t2 /= t2.sum(axis=0)

    m1 = gmmhmm(2)
    m1.fit(t1)
    m2 = gmmhmm(2)
    m2.fit(t2)

    m1t1 = m1.transform(t1)
    m2t1 = m2.transform(t1)
    print "Likelihoods for test set 1"
    print "M1:", m1t1
    print "M2:", m2t1
    print "Prediction for test set 1"
    print "Model", np.argmax([m1t1, m2t1]) + 1
    print

    m1t2 = m1.transform(t2)
    m2t2 = m2.transform(t2)
    print "Likelihoods for test set 2"
    print "M1:", m1t2
    print "M2:", m2t2
    print "Prediction for test set 2"
    print "Model", np.argmax([m1t2, m2t2]) + 1

aspats commented Feb 17, 2015

Hi kastnerkyle,

I am really interested in finding a good example of GMM-HMM for speech recognition.
I wanted to use your code to test the MATLAB example https://code.google.com/p/hmm-speech-recognition/source/browse/Word.m from which you made the Python code, so I tried it on the MATLAB project's audio files. I found what may be a mistake in your Python code:

partial_covs = np.triu(partial_covs) + np.triu(partial_covs).T - np.diag(partial_covs)

should be

expected_covs[:, :, s] = np.triu(partial_covs) + np.triu(partial_covs).T - np.diag(partial_covs)

With the real audio files from the MATLAB project I could not get your Python code to work. Your example works with generated numbers, but with real audio it did not. After fixing the problem described above, I get a new error:

"ValueError: the input matrix must be positive semidefinite"

which comes from the _state_likelihood(self, obs) function.

kastnerkyle commented Apr 14, 2016

Hi, I didn't have issues with this code here: http://kastnerkyle.github.io/posts/single-speaker-word-recognition-with-hidden-markov-models/ and it was using the same files from that MATLAB demo. Can you describe a bit more what you did? One thing you can try is adding a small positive constant to the diagonal of the covariance in the _state_likelihood - that often helps with these small numerical issues.
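
A minimal sketch of that diagonal-loading idea, assuming the constant is added inside _state_likelihood right before the density is evaluated (the 1e-2 value is an arbitrary choice, not something from the original gist):

    def _state_likelihood(self, obs):
        obs = np.atleast_2d(obs)
        B = np.zeros((self.n_states, obs.shape[1]))
        for s in range(self.n_states):
            # Load the diagonal so the covariance stays positive definite
            # before scipy evaluates the density (constant chosen arbitrarily)
            loaded_cov = self.covs[:, :, s] + 1e-2 * np.eye(self.n_dims)
            B[s, :] = st.multivariate_normal.pdf(obs.T, mean=self.mu[:, s].T, cov=loaded_cov)
        return B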

kbagalo commented Oct 20, 2016

Hello kastnerkyle,
I am trying to implement the example you have given (apple-banana-pineapple, ...) using the hmmlearn Python module. I am unable to use the model.fit(X) call properly, as I can't make sense of what X should look like (I have understood what it is in your implementation). Could you please guide me here?
Any response would be greatly appreciated.

yvanlucas commented Oct 27, 2016

@kbagalo

You might want to look at the hmmlearn documentation for this: https://github.com/hmmlearn/hmmlearn/blob/master/doc/tutorial.rst

Here is a copy-paste of the relevant part (if I understood your question correctly):

Working with multiple sequences

All of the examples so far were using a single sequence of observations. The input format in the case of multiple sequences is a bit involved and is best understood by example.

Consider two 1D sequences:

X1 = [[0.5], [1.0], [-1.0], [0.42], [0.24]]
X2 = [[2.4], [4.2], [0.5], [-0.24]]

To pass both sequences to fit or predict, first concatenate them into a single array and then compute an array of sequence lengths:

X = np.concatenate([X1, X2])
lengths = [len(X1), len(X2)]

Finally, just call the desired method with X and lengths:

hmm.GaussianHMM(n_components=3).fit(X, lengths)

iichangle commented Apr 28, 2017

Hi kastnerkyle,
I noticed that the Python code above is really a GaussianHMM rather than a GMMHMM, since each state's emission distribution has only a single Gaussian component, so there is no "mixture". I wonder if that is right. Thank you.

veydan commented Jul 9, 2017

Hi,

I think this is NOT a GMM-HMM: the code has no "mixture" of multiple Gaussian components for each state, but rather a single multivariate Gaussian per state. Thanks.
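
For comparison, a minimal sketch of a model whose emissions really are per-state Gaussian mixtures, using hmmlearn's GMMHMM class (see the tutorial linked earlier in this thread); the sizes and hyperparameters here are arbitrary illustrations:

    import numpy as np
    from hmmlearn import hmm

    X = np.random.RandomState(0).rand(200, 6)   # 200 frames of 6-dimensional features
    # 3 hidden states, each emitting from a 4-component Gaussian mixture
    model = hmm.GMMHMM(n_components=3, n_mix=4, covariance_type="diag", n_iter=10)
    model.fit(X)
    log_likelihood = model.score(X)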

Rapternmn commented Nov 23, 2017

I think there is something wrong with this line:

partial_sum = self.A * np.dot(alpha[:, t], (beta[:, t] * B[:, t + 1]).T)

I think it should be

partial_sum = self.A * np.dot(alpha[:, t], (beta[:, t + 1] * B[:, t + 1]).T)

Please tell me what I am missing if I am wrong about this.
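
For reference, a minimal sketch of the textbook Baum-Welch xi accumulation (Rabiner-style, written with an outer product over state pairs); this is not taken from the gist and is only meant as a point of comparison:

    # xi_t(i, j) is proportional to alpha_t(i) * A[i, j] * B[j, t + 1] * beta_{t+1}(j)
    partial_sum = self.A * np.outer(alpha[:, t], B[:, t + 1] * beta[:, t + 1])
    partial_sum /= np.sum(partial_sum)   # normalize over all (i, j) pairs
    xi_sum += partial_sum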


huangteng1220 commented Feb 2, 2018

I am new to statistical analysis. Here is a detailed description of my problem: I have a 4D data set like this:

ObjectID    Features per object    Feature length    Type
1           100                    64                1
2           100                    64                1
3           100                    64                2
4           100                    64                3
...         ...                    ...               ...

The shape is (object ID, number of features per object, feature length, types), e.g. (274500, 100, 64, 10). How should I use hmmlearn or pomegranate HMM for recognition on this 4D data?
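
One possible way to map data shaped like that onto hmmlearn's (X, lengths) interface, assuming each object is a sequence of 100 frames with 64 features and that one model is trained per type; every name and size below is a stand-in for illustration, not something from this thread:

    import numpy as np
    from hmmlearn import hmm

    # Dummy stand-ins for the described data: 20 objects, 100 frames each,
    # 64 features per frame, plus a type label per object (layout is assumed)
    rng = np.random.RandomState(0)
    features = rng.rand(20, 100, 64)
    labels = rng.randint(1, 4, size=20)

    models = {}
    for label in np.unique(labels):
        seqs = features[labels == label]              # all objects of this type
        X = seqs.reshape(-1, seqs.shape[-1])          # stacked frames: (n_obj * 100, 64)
        lengths = [seqs.shape[1]] * seqs.shape[0]     # each sequence is 100 frames long
        models[label] = hmm.GaussianHMM(n_components=5, n_iter=10).fit(X, lengths)

    # classify a new object by whichever per-type model scores it highest
    new_obj = rng.rand(100, 64)
    scores = {label: m.score(new_obj) for label, m in models.items()}
    predicted_type = max(scores, key=scores.get)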
