Image Captioning Homework
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h1 align=\"center\"> Image Captioning </h1>\n",
"\n",
"To begin with, let us download the dataset of image features from a pre-trained GoogleNet." | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!wget https://www.dropbox.com/s/d50pqlm19c6f6w5/data.tar.gz?dl=0 -O data.tar.gz\n",
"!tar -xvzf data.tar.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data preprocessing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%time\n",
"# Read Dataset\n",
"import numpy as np\n",
"import pickle\n",
"\n",
"img_codes = np.load(\"data/image_codes.npy\")\n",
"captions = pickle.load(open('data/caption_tokens.pcl', 'rb'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print \"each image code is a 1000-unit vector:\", img_codes.shape\n",
"print img_codes[0,:10]\n",
"print '\\n\\n'\n",
"print \"for each image there are 5-7 descriptions, e.g.:\\n\"\n",
"print '\\n'.join(captions[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#split descriptions into tokens\n",
"for img_i in range(len(captions)):\n",
"    for caption_i in range(len(captions[img_i])):\n",
"        sentence = captions[img_i][caption_i]\n",
"        captions[img_i][caption_i] = [\"#START#\"] + sentence.split(' ') + [\"#END#\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Build a Vocabulary\n", | |
"\n", | |
"############# TO CODE IT BY YOURSELF ##################\n", | |
"word_counts = <here should be dict word:number of entrances>\n", | |
"\n", | |
"vocab = ['#UNK#', '#START#', '#END#']\n", | |
"vocab += [k for k, v in word_counts.items() if v >= 5]\n", | |
"n_tokens = len(vocab)\n", | |
"\n", | |
"assert 10000 <= n_tokens <= 10500\n", | |
"\n", | |
"word_to_index = {w: i for i, w in enumerate(vocab)}" | |
] | |
}, | |
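{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blank above (one option, not the reference solution): count token occurrences with `collections.Counter`. The service tokens are skipped here because `#START#` and `#END#` are already added to `vocab` explicitly.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# count how often each word occurs over all captions of all images\n",
"from collections import Counter\n",
"\n",
"word_counts = Counter(word\n",
"                      for img_captions in captions\n",
"                      for caption in img_captions\n",
"                      for word in caption\n",
"                      if word not in (\"#START#\", \"#END#\"))"
]
},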
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"PAD_ix = -1\n",
"UNK_ix = vocab.index('#UNK#')\n",
"\n",
"def as_matrix(sequences, max_len=None):\n",
"    max_len = max_len or max(map(len, sequences))\n",
"\n",
"    matrix = np.zeros((len(sequences), max_len), dtype='int32') + PAD_ix\n",
"    for i, seq in enumerate(sequences):\n",
"        row_ix = [word_to_index.get(word, UNK_ix) for word in seq[:max_len]]\n",
"        matrix[i, :len(row_ix)] = row_ix\n",
"\n",
"    return matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#try it out on several descriptions of a random image\n",
"as_matrix(captions[1337])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### My Neural Network"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# network shapes\n",
"CNN_FEATURE_SIZE = img_codes.shape[1]\n",
"EMBED_SIZE = 128 #pls change me if u want\n", | |
"LSTM_UNITS = 200 #pls change me if u want" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import theano\n",
"import lasagne\n",
"import theano.tensor as T\n",
"from lasagne.layers import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Input Variables\n",
"sentences = T.imatrix()  # [batch_size x time] of word ids\n",
"image_vectors = T.matrix()  # [batch_size x unit] of CNN image features\n",
"sentence_mask = T.neq(sentences, PAD_ix)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#network inputs\n",
"l_words = InputLayer((None, None), sentences)\n",
"l_mask = InputLayer((None, None), sentence_mask)\n",
"\n",
"#embeddings for words\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"l_word_embeddings = <Embedding Layer>"
]
},
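{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blank above (not the reference solution): `EmbeddingLayer` maps each word id to a trainable `EMBED_SIZE`-dimensional vector.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# one trainable embedding vector per vocabulary entry\n",
"l_word_embeddings = EmbeddingLayer(l_words, input_size=n_tokens, output_size=EMBED_SIZE)"
]
},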
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# input layer for image features\n", | |
"l_image_features = InputLayer((None, CNN_FEATURE_SIZE), image_vectors)\n", | |
"\n", | |
"############# TO CODE IT BY YOURSELF ##################\n", | |
"#convert 1000 image features from googlenet to whatever LSTM_UNITS you have set\n", | |
"#it's also a good idea to add some dropout here and there\n", | |
"l_image_features_small = <Apply Dropout to regularise your Net>\n", | |
"l_image_features_small = <Apply linear to get LSTM_UNITS size representation>\n", | |
"assert l_image_features_small.shape == (None, LSTM_UNITS)" | |
] | |
}, | |
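{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blanks above: dropout followed by a linear projection down to `LSTM_UNITS`. The dropout probability `p=0.5` is just a common default, not a prescribed value.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# dropout for regularisation, then a linear (nonlinearity=None) projection\n",
"l_image_features_small = DropoutLayer(l_image_features, p=0.5)\n",
"l_image_features_small = DenseLayer(l_image_features_small, num_units=LSTM_UNITS, nonlinearity=None)\n",
"assert l_image_features_small.output_shape == (None, LSTM_UNITS)"
]
},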
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"############# TO CODE IT BY YOURSELF ##################\n", | |
"# Concatinate image features and word embedings in one sequence \n", | |
"decoder = LSTMLayer(<What should be here?>,\n", | |
" num_units=LSTM_UNITS,\n", | |
" cell_init=<Use your brain =)>,\n", | |
" mask_input=<Mask?>,\n", | |
" grad_clipping=<boom grads>)" | |
] | |
}, | |
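{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible decoder sketch (not the reference solution): feed the word embeddings as the input sequence and condition on the image by initialising the LSTM cell state with the projected image features. An alternative is to prepend the image vector as the first element of the input sequence. `grad_clipping=10.0` is an arbitrary but common threshold.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# LSTM over word embeddings; the image enters through the initial cell state\n",
"decoder = LSTMLayer(l_word_embeddings,\n",
"                    num_units=LSTM_UNITS,\n",
"                    cell_init=l_image_features_small,\n",
"                    mask_input=l_mask,\n",
"                    grad_clipping=10.0)"
]
},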
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Decoding of rnn hiden states\n", | |
"from broadcast import BroadcastLayer,UnbroadcastLayer\n", | |
"\n", | |
"#apply whatever comes next to each tick of each example in a batch. Equivalent to 2 reshapes\n", | |
"broadcast_decoder_ticks = BroadcastLayer(decoder, (0, 1))\n", | |
"print \"broadcasted decoder shape = \", broadcast_decoder_ticks.output_shape\n", | |
"\n", | |
"predicted_probabilities_each_tick = DenseLayer(broadcast_decoder_ticks,\n", | |
" n_tokens, \n", | |
" nonlinearity=lasagne.nonlinearities.softmax)\n", | |
"\n", | |
"#un-broadcast back into (batch,tick,probabilities)\n", | |
"predicted_probabilities = UnbroadcastLayer(predicted_probabilities_each_tick, \n", | |
" broadcast_layer=broadcast_decoder_ticks)\n", | |
"\n", | |
"print \"output shape = \", predicted_probabilities.output_shape\n", | |
"\n", | |
"#remove if you know what you're doing (e.g. 1d convolutions or fixed shape)\n", | |
"assert predicted_probabilities.output_shape == (None, None, 10373)" | |
] | |
}, | |
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"next_word_probas = get_output(predicted_probabilities)\n", | |
"\n", | |
"reference_answers = sentences[:,1:]\n", | |
"output_mask = sentence_mask[:,1:]\n", | |
"\n", | |
"#write symbolic loss function to train NN for\n", | |
"loss = lasagne.objectives.categorical_crossentropy(\n", | |
" next_word_probas[:, :-1].reshape((-1, n_tokens)),\n", | |
" reference_answers.reshape((-1,))\n", | |
").reshape(reference_answers.shape)\n", | |
"\n", | |
"############# TO CODE IT BY YOURSELF ##################\n", | |
"loss = <mean over non-PAD tokens>" | |
] | |
}, | |
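{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blank above: average the per-token crossentropy over real (non-PAD) tokens only, using the shifted mask.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sum of losses at real tokens, divided by the number of real tokens\n",
"loss = T.sum(loss * output_mask) / T.sum(output_mask)"
]
},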
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#trainable NN weights\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"weights = <all trainable network weights>\n",
"updates = <your favorite optimizer>"
]
},
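{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blanks above; adam is just one optimizer choice.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# collect every trainable parameter reachable from the output layer\n",
"weights = get_all_params(predicted_probabilities, trainable=True)\n",
"updates = lasagne.updates.adam(loss, weights)"
]
},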
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#compile a function that takes image features and an input sentence, outputs loss and updates weights\n",
"#please note that your functions must accept image features as the FIRST param and sentences as the second one\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"train_step = <>\n",
"val_step = <>"
]
},
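{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blanks above. Note the argument order: image features first, sentences second, matching what `generate_batch` returns below.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# training step updates the weights; validation step only computes the loss\n",
"train_step = theano.function([image_vectors, sentences], loss, updates=updates, allow_input_downcast=True)\n",
"val_step = theano.function([image_vectors, sentences], loss, allow_input_downcast=True)"
]
},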
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# Training\n",
"\n",
"* You first have to implement a batch generator\n",
"* Then the network gets trained the usual way"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"captions = np.array(captions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from random import choice\n",
"\n",
"def generate_batch(images, captions, batch_size, max_caption_len=None):\n",
" #sample random numbers for image/caption indicies\n", | |
" random_image_ix = np.random.randint(0, len(images), size=batch_size)\n", | |
" \n", | |
" #get images\n", | |
" batch_images = images[random_image_ix]\n", | |
" \n", | |
" #5-7 captions for each image\n", | |
" captions_for_batch_images = captions[random_image_ix]\n", | |
" \n", | |
" #pick 1 from 5-7 captions for each image\n", | |
" batch_captions = map(choice, captions_for_batch_images)\n", | |
" \n", | |
" #convert to matrix\n", | |
" batch_captions_ix = as_matrix(batch_captions,max_len=max_caption_len)\n", | |
" \n", | |
" return batch_images, batch_captions_ix" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"generate_batch(img_codes,captions, 3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Main loop\n", | |
"* We recommend you to periodically evaluate the network using the next \"apply trained model\" block\n", | |
" * its safe to interrupt training, run a few examples and start training again" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"batch_size = 50  #adjust me\n",
"n_epochs = 100  #adjust me\n",
"n_batches_per_epoch = 50  #adjust me\n",
"n_validation_batches = 5  #how many batches are used for validation after each epoch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"\n",
"for epoch in range(n_epochs):\n",
"    train_loss = 0\n",
"    for _ in tqdm(range(n_batches_per_epoch)):\n",
"        train_loss += train_step(*generate_batch(img_codes, captions, batch_size))\n",
"    train_loss /= n_batches_per_epoch\n",
"\n",
"    val_loss = 0\n",
"    for _ in range(n_validation_batches):\n",
"        val_loss += val_step(*generate_batch(img_codes, captions, batch_size))\n",
"    val_loss /= n_validation_batches\n",
"\n",
"    print('\\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss, val_loss))\n",
"\n",
"print(\"Finished :)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Apply trained model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#the same kind of network you used last week, but a bit smaller\n",
"from pretrained_lenet import build_model, preprocess, MEAN_VALUES\n",
"\n",
"# build GoogLeNet\n",
"lenet = build_model()\n",
"\n",
"#load weights\n",
"lenet_weights = pickle.load(open('data/blvc_googlenet.pkl'))['param values']\n", | |
"set_all_param_values(lenet[\"prob\"], lenet_weights)\n", | |
"\n", | |
"#compile get_features\n", | |
"cnn_input_var = lenet['input'].input_var\n", | |
"cnn_feature_layer = lenet['loss3/classifier']\n", | |
"get_cnn_features = theano.function([cnn_input_var], lasagne.layers.get_output(cnn_feature_layer))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from matplotlib import pyplot as plt\n", | |
"%matplotlib inline\n", | |
"\n", | |
"#sample image\n", | |
"img = plt.imread('data/Dog-and-Cat.jpg')\n", | |
"img = preprocess(img)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#deprocess and show, one line :)\n", | |
"from pretrained_lenet import MEAN_VALUES\n", | |
"plt.imshow(np.transpose((img[0] + MEAN_VALUES)[::-1],[1,2,0]).astype('uint8'))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Generate caption" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"last_word_probas_det = get_output(predicted_probabilities,deterministic=False)[:,-1]\n", | |
"\n", | |
"get_probs = theano.function([image_vectors,sentences], last_word_probas_det)\n", | |
"\n", | |
"#this is exactly the generation function from week5 classwork,\n", | |
"#except now we condition on image features instead of words\n", | |
"def generate_caption(image,caption_prefix = (\"START\",),t=1,sample=True,max_len=100):\n", | |
" image_features = get_cnn_features(image)\n", | |
" caption = list(caption_prefix)\n", | |
" for _ in range(max_len):\n", | |
" \n", | |
" next_word_probs = get_probs(image_features,as_matrix([caption]) ).ravel()\n", | |
" #apply temperature\n", | |
" next_word_probs = next_word_probs**t / np.sum(next_word_probs**t)\n", | |
"\n", | |
" if sample:\n", | |
" next_word = np.random.choice(vocab,p=next_word_probs) \n", | |
" else:\n", | |
" next_word = vocab[np.argmax(next_word_probs)]\n", | |
"\n", | |
" caption.append(next_word)\n", | |
"\n", | |
" if next_word==\"#END#\":\n", | |
" break\n", | |
" \n", | |
" return caption" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"for i in range(10):\n", | |
" print ' '.join(generate_caption(img,t=1.)[1:-1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Bonus Part\n", | |
"- Use ResNet Instead of GoogLeNet\n", | |
"- Use W2V as embedding\n", | |
"- Use Attention" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda env:python2]", | |
"language": "python", | |
"name": "conda-env-python2-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |