justheuristic/Hierarchical softmax layer.ipynb

## Hierarchical softmax layer.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Hierarchical softmax layer.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## hierarchical_softmax_layer.py
import numpy as np
import theano.tensor as T
from lasagne import init
from lasagne.layers import Layer,MergeLayer, InputLayer,flatten


class HierarchicalSoftmaxDenseLayer(MergeLayer):
    """
    Two-level hierarchical softmax output layer.

    Wraps theano.tensor.nnet.h_softmax for a more convenient usage as a
    lasagne layer. The layer owns the four h_softmax parameters (W1, b1, W2,
    b2) and optionally takes a second input with the target indices.

    :param incoming: incoming lasagne layer. Inputs with more than two
        dimensions are flattened to (batch_size, n_features).
    :param num_units: the number of outputs
    :param n_classes: the number of intermediate classes of the two-layer
        hierarchical softmax. It corresponds to the number of outputs of the
        first softmax. See note at the end. With 'auto' it is
        ceil(sqrt(num_units)), or ceil(num_units / n_outputs_per_class) when
        n_outputs_per_class is given.
    :param n_outputs_per_class: the number of outputs per intermediate class.
        See note at the end. int, or 'auto' for ceil(num_units / n_classes).
    :param W1_init: lasagne init or a tensor of shape
        (number of features of the input x, n_classes)
        the weight matrix of the first softmax, which maps the input x to the
        probabilities of the classes.
    :param b1_init: lasagne init or a tensor of shape (n_classes,)
        the bias vector of the first softmax layer.
    :param W2_init: lasagne init or a tensor of shape
        (n_classes, number of features of the input x, n_outputs_per_class)
        the weight matrix of the second softmax, which maps the input x to
        the probabilities of the outputs.
    :param b2_init: lasagne init or a tensor of shape
        (n_classes, n_outputs_per_class)
        the bias vector of the second softmax layer.
    :param target: lasagne layer or tensor of shape either (batch_size,) or
        (batch_size, 1) (optional, default None)
        contains the indices of the targets for the minibatch
        input x. For each input, the function computes the output for its
        corresponding target. If target is None, then all the outputs are
        computed for each input.

    Notes
    -----
    The product of n_outputs_per_class and n_classes has to be greater or equal
    to num_units. If it is strictly greater, then the irrelevant outputs will
    be ignored.
    n_outputs_per_class and n_classes have to be the same as the corresponding
    dimensions of the tensors of W1, b1, W2 and b2.
    The most computationally efficient configuration is when
    n_outputs_per_class and n_classes are equal to the square root of
    num_units.
    """

    def __init__(self, incoming, num_units,
                 n_classes='auto',
                 n_outputs_per_class='auto',
                 W1_init=init.GlorotUniform(),
                 b1_init=init.Constant(0),
                 W2_init=init.GlorotUniform(),
                 b2_init=init.Constant(0),
                 target=None,
                 **kwargs):

        # flatten input layer if it has higher dimensionality
        if len(incoming.output_shape) != 2:
            assert len(incoming.output_shape) >= 2, \
                "incoming layer must have at least 2 dimensions"
            incoming = flatten(incoming)

        incomings = [incoming]

        # add target if provided (as theano tensor or lasagne layer)
        if target is not None:

            # convert tensor to layer
            if not isinstance(target, Layer):
                assert target.ndim <= 2, \
                    "target must have shape (batch_size,) or (batch_size, 1)"
                # Use output_shape rather than .shape: only InputLayer has a
                # .shape attribute, so .shape fails for any other incoming
                # layer (including the FlattenLayer created above).
                batch_size = incoming.output_shape[0]
                if target.ndim == 1:
                    target_shape = (batch_size,)
                else:
                    target_shape = (batch_size, 1)

                target = InputLayer(target_shape, input_var=target,
                                    name="target inputlayer")

            # check shape
            assert len(target.output_shape) <= 2, \
                "target must have shape (batch_size,) or (batch_size, 1)"
            if len(target.output_shape) == 2:
                assert target.output_shape[1] == 1, \
                    "2-dimensional target must have shape (batch_size, 1)"

            incomings.append(target)

        super(HierarchicalSoftmaxDenseLayer, self).__init__(incomings, **kwargs)

        # infer whichever of n_classes / n_outputs_per_class was left on 'auto'
        if n_classes == 'auto':
            if n_outputs_per_class == 'auto':
                n_classes = int(np.ceil(num_units ** .5))
            else:
                n_classes = int(np.ceil(float(num_units) / n_outputs_per_class))
        if n_outputs_per_class == 'auto':
            assert n_classes != 'auto'
            n_outputs_per_class = int(np.ceil(float(num_units) / n_classes))

        assert n_classes * n_outputs_per_class >= num_units, \
            "n_classes * n_outputs_per_class must be at least num_units"

        # remember dimensions
        self.num_units = num_units
        self.n_classes = n_classes
        self.n_outputs_per_class = n_outputs_per_class

        # create params; biases are excluded from regularization
        n_inputs = incoming.output_shape[1]
        self.W1 = self.add_param(W1_init, (n_inputs, self.n_classes),
                                 name="W1")
        self.b1 = self.add_param(b1_init, (self.n_classes,),
                                 name="b1", regularizable=False)
        self.W2 = self.add_param(W2_init,
                                 (self.n_classes, n_inputs, self.n_outputs_per_class),
                                 name="W2")
        self.b2 = self.add_param(b2_init,
                                 (self.n_classes, self.n_outputs_per_class),
                                 name="b2", regularizable=False)

    def get_output_for(self, inputs, return_probas_anyway=False, **kwargs):
        """
        Apply the hierarchical softmax to the layer inputs.

        Parameters
        ----------
        inputs: list with the input tensor and, if the layer was built with a
            target, the target tensor as second element.
        return_probas_anyway: if True, returns all probabilities even if
            target is provided.

        Returns
        -------
        output_probs: tensor of shape (batch_size, n_outputs) or (batch_size)
            Output of the two-layer hierarchical softmax for input x. If target is
            not specified (None), then all the outputs are computed and the
            returned tensor has shape (batch_size, n_outputs). Otherwise, when
            target is specified, only the corresponding outputs are computed and
            the returned tensor has thus shape (batch_size,).
        """

        x = inputs[0]

        if len(inputs) == 1 or return_probas_anyway:
            target = None
        else:
            assert len(inputs) == 2
            target = inputs[1]

        return T.nnet.h_softmax(x, x.shape[0],
                                self.num_units, self.n_classes,
                                self.n_outputs_per_class,
                                W1=self.W1, b1=self.b1,
                                W2=self.W2, b2=self.b2,
                                target=target)

    def get_output_shape_for(self, input_shapes, **kwargs):
        """
        Return (batch_size, num_units) without a target input and
        (batch_size,) with one.

        The decision is based only on the number of inputs, so a
        return_probas_anyway=True call to get_output_for is not reflected
        in the shape reported here.
        """
        if len(input_shapes) == 1:
            return (input_shapes[0][0], self.num_units)
        else:
            return (input_shapes[0][0],)


## License.md

      
    Raw
  

              License.md
            
          
    Feel free to use this layer anywhere and/or include it in any library, with or without notifying us. TL;DR: do whatever you want.
Devised by justheuristic@, khalman@ and https://github.com/ADmitri42.

  
## performance_test.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tue Nov  1 22:47:20 2016       \r\n",
      "+------------------------------------------------------+                       \r\n",
      "| NVIDIA-SMI 361.45     Driver Version: 361.45.11      |                       \r\n",
      "|-------------------------------+----------------------+----------------------+\r\n",
      "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\r\n",
      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\r\n",
      "|===============================+======================+======================|\r\n",
      "|   0  GeForce GTX TIT...  Off  | 0000:02:00.0     Off |                  N/A |\r\n",
      "| 22%   50C    P8    16W / 250W |    584MiB / 12287MiB |      0%      Default |\r\n",
      "+-------------------------------+----------------------+----------------------+\r\n",
      "|   1  GeForce GTX 680     Off  | 0000:03:00.0     N/A |                  N/A |\r\n",
      "| 40%   45C    P8    N/A /  N/A |      8MiB /  2047MiB |     N/A      Default |\r\n",
      "+-------------------------------+----------------------+----------------------+\r\n",
      "                                                                               \r\n",
      "+-----------------------------------------------------------------------------+\r\n",
      "| Processes:                                                       GPU Memory |\r\n",
      "|  GPU       PID  Type  Process name                               Usage      |\r\n",
      "|=============================================================================|\r\n",
      "|    0      3530    C   /home/apanin/anaconda/bin/python               204MiB |\r\n",
      "|    0      7414    C   /home/apanin/anaconda/bin/python               176MiB |\r\n",
      "|    0     16363    C   /home/apanin/anaconda/bin/python               175MiB |\r\n",
      "|    1                  Not Supported                                         |\r\n",
      "+-----------------------------------------------------------------------------+\r\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: THEANO_FLAGS=\"device=gpu0\"\n"
     ]
    }
   ],
   "source": [
    "%env THEANO_FLAGS=\"device=gpu0\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 5005)\n"
     ]
    }
   ],
   "source": [
    "import theano\n",
    "import theano.tensor as T\n",
    "theano.config.floatX='float32'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from lasagne.layers import *\n",
    "import lasagne\n",
    "from hierarchical_softmax_layer import HierarchicalSoftmaxDenseLayer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "l_in = InputLayer([100,256])\n",
    "target_y = T.ivector()\n",
    "n_outputs = 10**6\n",
    "l_out_softmax = DenseLayer(l_in, n_outputs, nonlinearity=lasagne.nonlinearities.softmax)\n",
    "l_out_hsoftmax = HierarchicalSoftmaxDenseLayer(l_in,n_outputs,target=target_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "data = [(np.random.normal(size=[100,256]).astype('float32'),\n",
    "         np.random.randint(0,n_outputs,size=[100],dtype='int32')) for i in range(100)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from lasagne.objectives import categorical_crossentropy as loss\n",
    "f_softmax = theano.function([l_in.input_var,target_y],loss(get_output(l_out_softmax),target_y).mean())\n",
    "f_hsoftmax = theano.function([l_in.input_var,target_y],get_output(l_out_hsoftmax).mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.47 s, sys: 859 ms, total: 5.33 s\n",
      "Wall time: 5.33 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#volatile gpu util 80~100%, titanx, NO CNMEM!\n",
    "for batch in data:\n",
    "    f_softmax(*batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 129 ms, sys: 74 ms, total: 203 ms\n",
      "Wall time: 202 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#volatile gpu util 90~100%, titanx, NO CNMEM!\n",
    "for batch in data:\n",
    "    f_hsoftmax(*batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	import numpy as np
	import theano.tensor as T
	from lasagne import init
	from lasagne.layers import Layer,MergeLayer, InputLayer,flatten


	class HierarchicalSoftmaxDenseLayer(MergeLayer):
	    """
	    Two-level hierarchical softmax output layer.

	    Wraps theano.tensor.nnet.h_softmax for a more convenient usage as a
	    lasagne layer. The layer owns the four h_softmax parameters (W1, b1, W2,
	    b2) and optionally takes a second input with the target indices.

	    :param incoming: incoming lasagne layer. Inputs with more than two
	        dimensions are flattened to (batch_size, n_features).
	    :param num_units: the number of outputs
	    :param n_classes: the number of intermediate classes of the two-layer
	        hierarchical softmax. It corresponds to the number of outputs of the
	        first softmax. See note at the end. With 'auto' it is
	        ceil(sqrt(num_units)), or ceil(num_units / n_outputs_per_class) when
	        n_outputs_per_class is given.
	    :param n_outputs_per_class: the number of outputs per intermediate class.
	        See note at the end. int, or 'auto' for ceil(num_units / n_classes).
	    :param W1_init: lasagne init or a tensor of shape
	        (number of features of the input x, n_classes)
	        the weight matrix of the first softmax, which maps the input x to the
	        probabilities of the classes.
	    :param b1_init: lasagne init or a tensor of shape (n_classes,)
	        the bias vector of the first softmax layer.
	    :param W2_init: lasagne init or a tensor of shape
	        (n_classes, number of features of the input x, n_outputs_per_class)
	        the weight matrix of the second softmax, which maps the input x to
	        the probabilities of the outputs.
	    :param b2_init: lasagne init or a tensor of shape
	        (n_classes, n_outputs_per_class)
	        the bias vector of the second softmax layer.
	    :param target: lasagne layer or tensor of shape either (batch_size,) or
	        (batch_size, 1) (optional, default None)
	        contains the indices of the targets for the minibatch
	        input x. For each input, the function computes the output for its
	        corresponding target. If target is None, then all the outputs are
	        computed for each input.

	    Notes
	    -----
	    The product of n_outputs_per_class and n_classes has to be greater or equal
	    to num_units. If it is strictly greater, then the irrelevant outputs will
	    be ignored.
	    n_outputs_per_class and n_classes have to be the same as the corresponding
	    dimensions of the tensors of W1, b1, W2 and b2.
	    The most computationally efficient configuration is when
	    n_outputs_per_class and n_classes are equal to the square root of
	    num_units.
	    """

	    def __init__(self, incoming, num_units,
	                 n_classes='auto',
	                 n_outputs_per_class='auto',
	                 W1_init=init.GlorotUniform(),
	                 b1_init=init.Constant(0),
	                 W2_init=init.GlorotUniform(),
	                 b2_init=init.Constant(0),
	                 target=None,
	                 **kwargs):

	        # flatten input layer if it has higher dimensionality
	        if len(incoming.output_shape) != 2:
	            assert len(incoming.output_shape) >= 2, \
	                "incoming layer must have at least 2 dimensions"
	            incoming = flatten(incoming)

	        incomings = [incoming]

	        # add target if provided (as theano tensor or lasagne layer)
	        if target is not None:

	            # convert tensor to layer
	            if not isinstance(target, Layer):
	                assert target.ndim <= 2, \
	                    "target must have shape (batch_size,) or (batch_size, 1)"
	                # Use output_shape rather than .shape: only InputLayer has a
	                # .shape attribute, so .shape fails for any other incoming
	                # layer (including the FlattenLayer created above).
	                batch_size = incoming.output_shape[0]
	                if target.ndim == 1:
	                    target_shape = (batch_size,)
	                else:
	                    target_shape = (batch_size, 1)

	                target = InputLayer(target_shape, input_var=target,
	                                    name="target inputlayer")

	            # check shape
	            assert len(target.output_shape) <= 2, \
	                "target must have shape (batch_size,) or (batch_size, 1)"
	            if len(target.output_shape) == 2:
	                assert target.output_shape[1] == 1, \
	                    "2-dimensional target must have shape (batch_size, 1)"

	            incomings.append(target)

	        super(HierarchicalSoftmaxDenseLayer, self).__init__(incomings, **kwargs)

	        # infer whichever of n_classes / n_outputs_per_class was left on 'auto'
	        if n_classes == 'auto':
	            if n_outputs_per_class == 'auto':
	                n_classes = int(np.ceil(num_units ** .5))
	            else:
	                n_classes = int(np.ceil(float(num_units) / n_outputs_per_class))
	        if n_outputs_per_class == 'auto':
	            assert n_classes != 'auto'
	            n_outputs_per_class = int(np.ceil(float(num_units) / n_classes))

	        assert n_classes * n_outputs_per_class >= num_units, \
	            "n_classes * n_outputs_per_class must be at least num_units"

	        # remember dimensions
	        self.num_units = num_units
	        self.n_classes = n_classes
	        self.n_outputs_per_class = n_outputs_per_class

	        # create params; biases are excluded from regularization
	        n_inputs = incoming.output_shape[1]
	        self.W1 = self.add_param(W1_init, (n_inputs, self.n_classes),
	                                 name="W1")
	        self.b1 = self.add_param(b1_init, (self.n_classes,),
	                                 name="b1", regularizable=False)
	        self.W2 = self.add_param(W2_init,
	                                 (self.n_classes, n_inputs, self.n_outputs_per_class),
	                                 name="W2")
	        self.b2 = self.add_param(b2_init,
	                                 (self.n_classes, self.n_outputs_per_class),
	                                 name="b2", regularizable=False)

	    def get_output_for(self, inputs, return_probas_anyway=False, **kwargs):
	        """
	        Apply the hierarchical softmax to the layer inputs.

	        Parameters
	        ----------
	        inputs: list with the input tensor and, if the layer was built with a
	            target, the target tensor as second element.
	        return_probas_anyway: if True, returns all probabilities even if
	            target is provided.

	        Returns
	        -------
	        output_probs: tensor of shape (batch_size, n_outputs) or (batch_size)
	            Output of the two-layer hierarchical softmax for input x. If target is
	            not specified (None), then all the outputs are computed and the
	            returned tensor has shape (batch_size, n_outputs). Otherwise, when
	            target is specified, only the corresponding outputs are computed and
	            the returned tensor has thus shape (batch_size,).
	        """

	        x = inputs[0]

	        if len(inputs) == 1 or return_probas_anyway:
	            target = None
	        else:
	            assert len(inputs) == 2
	            target = inputs[1]

	        return T.nnet.h_softmax(x, x.shape[0],
	                                self.num_units, self.n_classes,
	                                self.n_outputs_per_class,
	                                W1=self.W1, b1=self.b1,
	                                W2=self.W2, b2=self.b2,
	                                target=target)

	    def get_output_shape_for(self, input_shapes, **kwargs):
	        """
	        Return (batch_size, num_units) without a target input and
	        (batch_size,) with one.

	        The decision is based only on the number of inputs, so a
	        return_probas_anyway=True call to get_output_for is not reflected
	        in the shape reported here.
	        """
	        if len(input_shapes) == 1:
	            return (input_shapes[0][0], self.num_units)
	        else:
	            return (input_shapes[0][0],)
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Tue Nov 1 22:47:20 2016 \r\n",
	"+------------------------------------------------------+ \r\n",
	"\| NVIDIA-SMI 361.45 Driver Version: 361.45.11 \| \r\n",
	"\|-------------------------------+----------------------+----------------------+\r\n",
	"\| GPU Name Persistence-M\| Bus-Id Disp.A \| Volatile Uncorr. ECC \|\r\n",
	"\| Fan Temp Perf Pwr:Usage/Cap\| Memory-Usage \| GPU-Util Compute M. \|\r\n",
	"\|===============================+======================+======================\|\r\n",
	"\| 0 GeForce GTX TIT... Off \| 0000:02:00.0 Off \| N/A \|\r\n",
	"\| 22% 50C P8 16W / 250W \| 584MiB / 12287MiB \| 0% Default \|\r\n",
	"+-------------------------------+----------------------+----------------------+\r\n",
	"\| 1 GeForce GTX 680 Off \| 0000:03:00.0 N/A \| N/A \|\r\n",
	"\| 40% 45C P8 N/A / N/A \| 8MiB / 2047MiB \| N/A Default \|\r\n",
	"+-------------------------------+----------------------+----------------------+\r\n",
	" \r\n",
	"+-----------------------------------------------------------------------------+\r\n",
	"\| Processes: GPU Memory \|\r\n",
	"\| GPU PID Type Process name Usage \|\r\n",
	"\|=============================================================================\|\r\n",
	"\| 0 3530 C /home/apanin/anaconda/bin/python 204MiB \|\r\n",
	"\| 0 7414 C /home/apanin/anaconda/bin/python 176MiB \|\r\n",
	"\| 0 16363 C /home/apanin/anaconda/bin/python 175MiB \|\r\n",
	"\| 1 Not Supported \|\r\n",
	"+-----------------------------------------------------------------------------+\r\n"
	]
	}
	],
	"source": [
	"!nvidia-smi"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"env: THEANO_FLAGS=\"device=gpu0\"\n"
	]
	}
	],
	"source": [
	"%env THEANO_FLAGS=\"device=gpu0\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 5005)\n"
	]
	}
	],
	"source": [
	"import theano\n",
	"import theano.tensor as T\n",
	"theano.config.floatX='float32'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from lasagne.layers import *\n",
	"import lasagne\n",
	"from hierarchical_softmax_layer import HierarchicalSoftmaxDenseLayer"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"l_in = InputLayer([100,256])\n",
	"target_y = T.ivector()\n",
	"n_outputs = 10**6\n",
	"l_out_softmax = DenseLayer(l_in, n_outputs, nonlinearity=lasagne.nonlinearities.softmax)\n",
	"l_out_hsoftmax = HierarchicalSoftmaxDenseLayer(l_in,n_outputs,target=target_y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"data = [(np.random.normal(size=[100,256]).astype('float32'),\n",
	" np.random.randint(0,n_outputs,size=[100],dtype='int32')) for i in range(100)]\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from lasagne.objectives import categorical_crossentropy as loss\n",
	"f_softmax = theano.function([l_in.input_var,target_y],loss(get_output(l_out_softmax),target_y).mean())\n",
	"f_hsoftmax = theano.function([l_in.input_var,target_y],get_output(l_out_hsoftmax).mean())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 4.47 s, sys: 859 ms, total: 5.33 s\n",
	"Wall time: 5.33 s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"#volatile gpu util 80~100%, titanx, NO CNMEM!\n",
	"for batch in data:\n",
	" f_softmax(*batch)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 129 ms, sys: 74 ms, total: 203 ms\n",
	"Wall time: 202 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"#volatile gpu util 90~100%, titanx, NO CNMEM!\n",
	"for batch in data:\n",
	" f_hsoftmax(*batch)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}