@jcrubino
Created August 13, 2012 17:17
Some fairly clean (and fast) code for Restricted Boltzmann machines.
"""
Code for training RBMs with contrastive divergence. Tries to be as
quick and memory-efficient as possible while utilizing only pure Python
and NumPy.
"""
# Copyright (c) 2009, David Warde-Farley
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
import time
import numpy as np
class RBM(object):
    """
    Class representing a basic restricted Boltzmann machine, with
    binary stochastic visible units and binary stochastic hidden
    units.
    """
    def __init__(self, nvis, nhid, mfvis=True, mfhid=False, initvar=0.1):
        nweights = nvis * nhid
        vb_offset = nweights
        hb_offset = nweights + nvis

        # One flat parameter vector, with views onto it specified below.
        self.params = np.empty(nweights + nvis + nhid)

        # Weights between the hiddens and visibles
        self.weights = self.params[:vb_offset].reshape(nvis, nhid)

        # Biases on the visible units
        self.visbias = self.params[vb_offset:hb_offset]

        # Biases on the hidden units
        self.hidbias = self.params[hb_offset:]

        # Initialize the weights to small zero-mean Gaussian values
        # with variance `initvar`, and the biases to zero. (In the
        # original code `initvar` was accepted but never used, leaving
        # self.params uninitialized.)
        self.weights[...] = np.random.normal(scale=np.sqrt(initvar),
                                             size=self.weights.shape)
        self.visbias[...] = 0.
        self.hidbias[...] = 0.

        # Attributes for scratch arrays used during sampling.
        self._hid_states = None
        self._vis_states = None

        # Instance-specific mean field settings.
        self._mfvis = mfvis
        self._mfhid = mfhid
    @property
    def numvis(self):
        """The number of visible units (i.e. dimension of the input)."""
        return self.visbias.shape[0]

    @property
    def numhid(self):
        """The number of hidden units in this model."""
        return self.hidbias.shape[0]
    def _prepare_buffer(self, ncases, kind):
        """
        Prepare the _hid_states and _vis_states buffers for
        use for a minibatch of size `ncases`, reshaping or
        reallocating as necessary. `kind` is one of 'hid', 'vis'.
        """
        if kind not in ['hid', 'vis']:
            raise ValueError('kind argument must be hid or vis')
        name = '_%s_states' % kind
        num = getattr(self, 'num%s' % kind)
        buf = getattr(self, name)
        if buf is None or buf.shape[0] < ncases:
            if buf is not None:
                del buf
            buf = np.empty((ncases, num))
            setattr(self, name, buf)
        buf[...] = np.nan
        return buf[:ncases]
    def hid_activate(self, input, mf=False):
        """
        Activate the hidden units by sampling from their conditional
        distribution given each of the rows of `input`. If `mf` is True,
        return the deterministic, real-valued probabilities of activation
        in place of stochastic binary samples ('mean-field').
        """
        input = np.atleast_2d(input)
        ncases, ndim = input.shape
        hid = self._prepare_buffer(ncases, 'hid')
        self._update_hidden(input, hid, mf)
        return hid
    def _update_hidden(self, vis, hid, mf=False):
        """
        Update hidden units by writing new values to array `hid`.
        If `mf` is False, hidden unit values are sampled from their
        conditional distribution given the visible unit configurations
        specified in each row of `vis`. If `mf` is True, the
        deterministic, real-valued probabilities of activation are
        written instead of stochastic binary samples ('mean-field').
        """
        # Implements 1/(1 + exp(-(vW + b))) with in-place operations.
        hid[...] = np.dot(vis, self.weights)
        hid[...] += self.hidbias
        hid *= -1.
        np.exp(hid, hid)
        hid += 1.
        hid **= -1.
        if not mf:
            self.sample_hid(hid)
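    # An equivalence note, not part of the original gist: the six in-place
    # operations above compute the logistic sigmoid of the hidden inputs.
    # A clearer (but allocating) version of the same update would be:
    #
    #     hid[...] = 1. / (1. + np.exp(-(np.dot(vis, self.weights)
    #                                    + self.hidbias)))
    #
    # The in-place form avoids the temporary arrays this one-liner creates.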
    def _update_visible(self, vis, hid, mf=False):
        """
        Update visible units by writing new values to array `vis`.
        If `mf` is False, visible unit values are sampled from their
        conditional distribution given the hidden unit configurations
        specified in each row of `hid`. If `mf` is True, the
        deterministic, real-valued probabilities of activation are
        written instead of stochastic binary samples ('mean-field').
        """
        # Implements 1/(1 + exp(-(hW' + b))) with in-place operations.
        vis[...] = np.dot(hid, self.weights.T)
        vis[...] += self.visbias
        vis *= -1.
        np.exp(vis, vis)
        vis += 1.
        vis **= -1.
        if not mf:
            self.sample_vis(vis)
    @classmethod
    def binary_threshold(cls, probs):
        """
        Given a set of real-valued activation probabilities,
        sample binary values with the given Bernoulli parameters,
        and update the array in-place with the Bernoulli samples.
        """
        samples = np.random.uniform(size=probs.shape)
        # Simulate Bernoulli trials with p = probs[i, j] by generating
        # random uniforms and counting any draw less than probs[i, j]
        # as a success.
        probs[samples < probs] = 1.
        # Anything not set to 1 should be 0 once floored.
        np.floor(probs, probs)

    # Binary hidden units
    sample_hid = binary_threshold

    # Binary visible units
    sample_vis = binary_threshold
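    # A sanity-check sketch, not part of the original gist: for a matrix of
    # probabilities p, binary_threshold overwrites p with {0, 1} samples
    # whose mean approaches the original Bernoulli parameter, e.g.:
    #
    #     p = np.full((100000, 1), 0.3)
    #     RBM.binary_threshold(p)
    #     assert abs(p.mean() - 0.3) < 0.01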
    def gibbs_walk(self, nsteps, hid):
        """
        Perform `nsteps` steps of alternating Gibbs sampling,
        sampling the hidden units in parallel followed by the
        visible units.

        Depending on instantiation arguments, one or both sets of
        units may instead have "mean-field" activities computed.
        Mean-field is always used in lieu of sampling for the
        terminal hidden unit configuration.
        """
        hid = np.atleast_2d(hid)
        ncases = hid.shape[0]

        # Allocate (or reuse) a buffer in which to store
        # the states of the visible units.
        vis = self._prepare_buffer(ncases, 'vis')
        for step in range(nsteps):
            # Update the visible units, conditioning on the hidden units.
            self._update_visible(vis, hid, self._mfvis)

            # Always do mean-field on the last hidden unit update to get a
            # less noisy estimate of the negative phase correlations.
            if step < nsteps - 1:
                mfhid = self._mfhid
            else:
                mfhid = True

            # Update the hidden units, conditioning on the visible units.
            self._update_hidden(vis, hid, mfhid)
        return self._vis_states[:ncases], self._hid_states[:ncases]
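# A minimal usage sketch, not part of the original gist: build a small RBM
# and draw fantasy particles with a short Gibbs walk. The sizes and the
# 25-step chain length are illustrative assumptions, not recommendations.
def _demo_rbm_sampling():
    rng = np.random.RandomState(0)
    rbm = RBM(nvis=16, nhid=8)
    data = (rng.uniform(size=(5, 16)) > 0.5).astype(float)
    hid = rbm.hid_activate(data)        # stochastic binary hidden states
    vis, hid = rbm.gibbs_walk(25, hid)  # alternating Gibbs sampling
    return vis, hid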
class GaussianBinaryRBM(RBM):
    """
    An RBM with real-valued Gaussian visible units (with fixed unit
    variance) and binary stochastic hidden units.
    """
    def _update_visible(self, vis, hid, mf=False):
        # The conditional mean of the Gaussian visibles is a linear
        # function of the hidden states; no sigmoid is applied.
        vis[...] = np.dot(hid, self.weights.T)
        vis += self.visbias
        if not mf:
            self.sample_vis(vis)

    @classmethod
    def sample_vis(cls, vis):
        # Sample by adding unit-variance Gaussian noise to the means.
        vis += np.random.normal(size=vis.shape)
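# A usage note, an assumption rather than part of the original gist: the
# Gaussian visibles above have fixed unit variance, so real-valued training
# data should be standardized to zero mean and unit variance per dimension
# before training, e.g.:
#
#     data = (data - data.mean(axis=0)) / (data.std(axis=0) + 1e-8)
#     grbm = GaussianBinaryRBM(nvis=data.shape[1], nhid=64)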
class CDTrainer(object):
    """An object that trains a model using vanilla contrastive divergence."""
    def __init__(self, model, weightcost=0.0002, rates=(1e-4, 1e-4, 1e-4),
                 cachebatchsums=True):
        self._model = model
        self._visbias_rate, self._hidbias_rate, self._weight_rate = rates
        self._weightcost = weightcost
        self._cachebatchsums = cachebatchsums
        self._weightstep = np.zeros(model.weights.shape)
    def train(self, data, epochs, cdsteps=1, minibatch=50, momentum=0.9):
        """
        Train an RBM with contrastive divergence, using `cdsteps`
        steps of alternating Gibbs sampling to draw the negative phase
        samples.
        """
        data = np.atleast_2d(data)
        ncases, ndim = data.shape
        model = self._model
        if self._cachebatchsums:
            # Cache the summed visible activities of each minibatch so
            # that they are computed only once across epochs.
            batchsums = {}
        for epoch in range(epochs):
            # An epoch is a single pass through the training data.
            epoch_start = time.time()

            # Mean squared error isn't really the right thing to measure
            # for RBMs with binary visible units, but gives a good enough
            # indication of whether things are moving in the right way.
            mse = 0
            for offset in range(0, ncases, minibatch):
                # Select a minibatch of data.
                batch = data[offset:(offset + minibatch)]
                batchsize = batch.shape[0]

                # Mean-field pass on the hidden units.
                hid = model.hid_activate(batch, mf=True)

                # Correlations between the data and the hidden activations.
                poscorr = np.dot(batch.T, hid)

                # Activities of the hidden units.
                posact = hid.sum(axis=0)

                # Threshold the hidden units so that they can't convey
                # more than 1 bit of information in the subsequent
                # sampling (assuming the hidden units are binary,
                # which they most often are).
                model.sample_hid(hid)

                # Simulate Gibbs sampling for a given number of steps.
                vis, hid = model.gibbs_walk(cdsteps, hid)

                # Update the weights with the difference in correlations
                # between the positive and negative phases.
                thisweightstep = poscorr
                thisweightstep -= np.dot(vis.T, hid)
                thisweightstep /= batchsize
                thisweightstep -= self._weightcost * model.weights
                thisweightstep *= self._weight_rate

                self._weightstep *= momentum
                self._weightstep += thisweightstep
                model.weights += self._weightstep

                # The gradient of the visible biases is the difference in
                # summed visible activities for the minibatch.
                if self._cachebatchsums:
                    if offset not in batchsums:
                        batchsum = batch.sum(axis=0)
                        batchsums[offset] = batchsum
                    else:
                        batchsum = batchsums[offset]
                else:
                    batchsum = batch.sum(axis=0)

                visbias_step = batchsum - vis.sum(axis=0)
                visbias_step *= self._visbias_rate / batchsize
                model.visbias += visbias_step

                # The gradient of the hidden biases is the difference in
                # summed hidden activities for the minibatch.
                hidbias_step = posact - hid.sum(axis=0)
                hidbias_step *= self._hidbias_rate / batchsize
                model.hidbias += hidbias_step

                # Compute the squared error in-place.
                vis -= batch
                vis **= 2.

                # Add to the total epoch estimate.
                mse += vis.sum() / ncases

            print("Done epoch %d: %f seconds, MSE=%f" %
                  (epoch + 1, time.time() - epoch_start, mse))
            sys.stdout.flush()
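
# A minimal end-to-end sketch, not part of the original gist: train a tiny
# RBM on synthetic binary data. The data shape and hyperparameters are
# illustrative assumptions only.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    data = (rng.uniform(size=(500, 16)) > 0.5).astype(float)
    rbm = RBM(nvis=16, nhid=8)
    trainer = CDTrainer(rbm)
    trainer.train(data, epochs=5, cdsteps=1, minibatch=50)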