@cheind
Created December 17, 2017 10:15
HMM training based on gradient descent (MXNet imperative version)
__author__ = 'Christoph Heindl'
__copyright__ = 'Copyright 2017'
__license__ = 'BSD'
"""Trains a HMM based on gradient descent optimization.
The parameters (theta) of the model are transition and
emission probabilities, as well as the initial state probabilities.
Given a start solution, the negative log likelihood of data given the
model P(obs|theta) is minimized.
Since we cannot guarantee valid probability distributions after a gradient
step, we add an additional layer to the trainable variables that turns
them into probability distributions (here softmax is used). The optimized
distributions are then the softmaxed version of optimized variables.
"""
import mxnet as mx
import numpy as np
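
# A minimal sketch of the reparameterization described in the docstring above
# (illustrative only, not part of the training code): a row-wise softmax maps
# arbitrary real-valued variables to a valid probability distribution, so
# gradient steps on the raw variables can never leave the probability simplex.
_raw = mx.ndarray.array(np.array([[2.0, -1.0, 0.5, 0.0]]), dtype=np.float32)
_probs = mx.ndarray.softmax(_raw, axis=1)  # positive entries, each row sums to 1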
def log_likelihood(A, B, pi, obs):
    '''Computes the log-likelihood of the HMM as log(P(obs|theta)).

    The logarithmic version is preferred in optimization as it avoids
    underflows and scales the loss appropriately, so that backward sweeps
    stay within meaningful ranges.

    See http://courses.media.mit.edu/2010fall/mas622j/ProblemSets/ps4/tutorial.pdf
    for details.
    '''
    AT = mx.ndarray.transpose(A)

    def forward(prev, o):
        # Scaled forward step: propagate the state distribution through the
        # transitions, weight by the emission probabilities of observation o,
        # then rescale so the vector sums to one. The scale factor c is kept
        # to accumulate the log-likelihood.
        f = B[o].reshape((4, 1)) * mx.ndarray.dot(AT, prev)
        c = 1. / f.sum()
        return f * c, c

    f = pi.reshape((4, 1))  # column vector, so shapes stay consistent in the recursion
    s = mx.ndarray.zeros(1, ctx=A.context)
    for o in obs:
        f, c = forward(f, o)
        s = s + c.log()
    return -s
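
# For reference, the same scaled forward recursion in plain NumPy (a sketch for
# cross-checking the MXNet version above; np_log_likelihood is not part of the
# original gist). Rescaling the forward vector at every step and summing the log
# scale factors is what keeps the computation free of underflow.
def np_log_likelihood(A, B, pi, obs):
    f = np.asarray(pi, dtype=np.float64)
    log_p = 0.0
    for o in obs:
        f = B[o] * A.T.dot(f)  # unnormalized forward vector
        s = f.sum()            # per-step scale factor
        f = f / s
        log_p += np.log(s)
    return log_p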
np.set_printoptions(precision=3, suppress=True)
model_ctx = mx.gpu(0)  # all arrays live on the GPU; use mx.cpu() if no GPU is available
# Transition probabilities
A_ = mx.ndarray.array(
    np.array([
        [0.8, 0.2, 0.0, 0.0],
        [0.1, 0.5, 0.4, 0.0],
        [0.0, 0.4, 0.2, 0.4],
        [0.0, 0.0, 0.4, 0.6],
    ]), dtype=np.float32, ctx=model_ctx
)
# Emission probabilities (rows: observation symbols, columns: states)
B_ = mx.ndarray.array(
    np.array([
        [0.7, 0.1, 0.2, 0.1],
        [0.15, 0.4, 0.7, 0.3],
        [0.15, 0.5, 0.1, 0.6]
    ]), dtype=np.float32, ctx=model_ctx
)
# Initial state probabilities
pi = mx.ndarray.array([0.6, 0.1, 0.2, 0.1], dtype=np.float32, ctx=model_ctx)
params = [A_, B_]
for p in params:
    p.attach_grad()

def step(params, lr):
    # Plain gradient descent update on the raw (unconstrained) variables.
    for p in params:
        p[:] = p[:] - lr * p.grad
# Observation sequence (symbol indices, i.e. row indices into B)
obs = [0, 0, 1, 0, 0, 0, 2, 2, 0, 2, 2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 1, 1, 1, 0, 2, 0, 1, 2, 0, 0]
for i in range(1000):
    with mx.autograd.record():
        # Turn the unconstrained variables into valid probability distributions.
        A = mx.ndarray.softmax(A_, axis=1)
        B = mx.ndarray.softmax(B_, axis=0)
        ll = log_likelihood(A, B, pi, obs)
        loss = -ll
    loss.backward()
    step(params, 1e-2)
    if i % 100 == 0:
        print('{} - log-likelihood {}'.format(i, ll.asscalar()))

print(mx.ndarray.softmax(A_, axis=1))
print(mx.ndarray.softmax(B_, axis=0))
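
# Illustrative follow-up (not part of the original gist): pull the learned,
# normalized distributions back to NumPy for inspection or downstream use.
A_learned = mx.ndarray.softmax(A_, axis=1).asnumpy()
B_learned = mx.ndarray.softmax(B_, axis=0).asnumpy()
print('rows of A sum to 1:', A_learned.sum(axis=1))
print('columns of B sum to 1:', B_learned.sum(axis=0))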
kczarn commented Jul 26, 2018

Nice and helpful as an example of low-level imperative MXNet implementation. No symbols, just NDArray API. Thanks!
