Image Captioning Homework
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h1 align=\"center\"> Image Captioning </h1>\n",
"\n",
"To begin with, let us download the dataset of image features from a pre-trained GoogleNet." | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!wget https://www.dropbox.com/s/d50pqlm19c6f6w5/data.tar.gz?dl=0 -O data.tar.gz\n",
"!tar -xvzf data.tar.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data preprocessing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%time\n",
"# Read Dataset\n",
"import numpy as np\n",
"import pickle\n",
"\n",
"img_codes = np.load(\"data/image_codes.npy\")\n",
"captions = pickle.load(open('data/caption_tokens.pcl', 'rb'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print \"each image code is a 1000-unit vector:\", img_codes.shape\n",
"print img_codes[0,:10]\n",
"print '\\n\\n'\n",
"print \"for each image there are 5-7 descriptions, e.g.:\\n\"\n",
"print '\\n'.join(captions[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#split descriptions into tokens\n",
"for img_i in range(len(captions)):\n",
"    for caption_i in range(len(captions[img_i])):\n",
"        sentence = captions[img_i][caption_i]\n",
"        captions[img_i][caption_i] = [\"#START#\"] + sentence.split(' ') + [\"#END#\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Build a Vocabulary\n", | |
"\n", | |
"############# TO CODE IT BY YOURSELF ##################\n", | |
"word_counts = <here should be dict word:number of entrances>\n", | |
"\n", | |
"vocab = ['#UNK#', '#START#', '#END#']\n", | |
"vocab += [k for k, v in word_counts.items() if v >= 5]\n", | |
"n_tokens = len(vocab)\n", | |
"\n", | |
"assert 10000 <= n_tokens <= 10500\n", | |
"\n", | |
"word_to_index = {w: i for i, w in enumerate(vocab)}" | |
] | |
}, | |
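{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blank above (one option, not the reference solution): count token occurrences with `collections.Counter`. The service tokens are skipped here because `#START#` and `#END#` are already added to `vocab` explicitly.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# count how often each word occurs over all captions of all images\n",
"from collections import Counter\n",
"\n",
"word_counts = Counter(word\n",
"                      for img_captions in captions\n",
"                      for caption in img_captions\n",
"                      for word in caption\n",
"                      if word not in (\"#START#\", \"#END#\"))"
]
},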
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"PAD_ix = -1\n",
"UNK_ix = vocab.index('#UNK#')\n",
"\n",
"def as_matrix(sequences, max_len=None):\n",
"    max_len = max_len or max(map(len, sequences))\n",
"\n",
"    matrix = np.zeros((len(sequences), max_len), dtype='int32') + PAD_ix\n",
"    for i, seq in enumerate(sequences):\n",
"        row_ix = [word_to_index.get(word, UNK_ix) for word in seq[:max_len]]\n",
"        matrix[i, :len(row_ix)] = row_ix\n",
"\n",
"    return matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#try it out on several descriptions of a random image\n",
"as_matrix(captions[1337])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### My Neural Network"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# network shapes\n",
"CNN_FEATURE_SIZE = img_codes.shape[1]\n",
"EMBED_SIZE = 128 #pls change me if u want\n", | |
"LSTM_UNITS = 200 #pls change me if u want" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import theano\n",
"import lasagne\n",
"import theano.tensor as T\n",
"from lasagne.layers import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Input Variables\n",
"sentences = T.imatrix()  # [batch_size x time] of word ids\n",
"image_vectors = T.matrix()  # [batch_size x unit] of CNN image features\n",
"sentence_mask = T.neq(sentences, PAD_ix)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#network inputs\n",
"l_words = InputLayer((None, None), sentences)\n",
"l_mask = InputLayer((None, None), sentence_mask)\n",
"\n",
"#embeddings for words\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"l_word_embeddings = <Embedding Layer>"
]
},
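{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blank above (not the reference solution): `EmbeddingLayer` maps each word id to a trainable `EMBED_SIZE`-dimensional vector.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# one trainable embedding vector per vocabulary entry\n",
"l_word_embeddings = EmbeddingLayer(l_words, input_size=n_tokens, output_size=EMBED_SIZE)"
]
},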
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# input layer for image features\n", | |
"l_image_features = InputLayer((None, CNN_FEATURE_SIZE), image_vectors)\n", | |
"\n", | |
"############# TO CODE IT BY YOURSELF ##################\n", | |
"#convert 1000 image features from googlenet to whatever LSTM_UNITS you have set\n", | |
"#it's also a good idea to add some dropout here and there\n", | |
"l_image_features_small = <Apply Dropout to regularise your Net>\n", | |
"l_image_features_small = <Apply linear to get LSTM_UNITS size representation>\n", | |
"assert l_image_features_small.shape == (None, LSTM_UNITS)" | |
] | |
}, | |
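{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blanks above: dropout followed by a linear projection down to `LSTM_UNITS`. The dropout probability `p=0.5` is just a common default, not a prescribed value.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# dropout for regularisation, then a linear (nonlinearity=None) projection\n",
"l_image_features_small = DropoutLayer(l_image_features, p=0.5)\n",
"l_image_features_small = DenseLayer(l_image_features_small, num_units=LSTM_UNITS, nonlinearity=None)\n",
"assert l_image_features_small.output_shape == (None, LSTM_UNITS)"
]
},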
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"############# TO CODE IT BY YOURSELF ##################\n", | |
"# Concatinate image features and word embedings in one sequence \n", | |
"decoder = LSTMLayer(<What should be here?>,\n", | |
" num_units=LSTM_UNITS,\n", | |
" cell_init=<Use your brain =)>,\n", | |
" mask_input=<Mask?>,\n", | |
" grad_clipping=<boom grads>)" | |
] | |
}, | |
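{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible decoder sketch (not the reference solution): feed the word embeddings as the input sequence and condition on the image by initialising the LSTM cell state with the projected image features. An alternative is to prepend the image vector as the first element of the input sequence. `grad_clipping=10.0` is an arbitrary but common threshold.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# LSTM over word embeddings; the image enters through the initial cell state\n",
"decoder = LSTMLayer(l_word_embeddings,\n",
"                    num_units=LSTM_UNITS,\n",
"                    cell_init=l_image_features_small,\n",
"                    mask_input=l_mask,\n",
"                    grad_clipping=10.0)"
]
},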
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Decoding of rnn hiden states\n", | |
"from broadcast import BroadcastLayer,UnbroadcastLayer\n", | |
"\n", | |
"#apply whatever comes next to each tick of each example in a batch. Equivalent to 2 reshapes\n", | |
"broadcast_decoder_ticks = BroadcastLayer(decoder, (0, 1))\n", | |
"print \"broadcasted decoder shape = \", broadcast_decoder_ticks.output_shape\n", | |
"\n", | |
"predicted_probabilities_each_tick = DenseLayer(broadcast_decoder_ticks,\n", | |
" n_tokens, \n", | |
" nonlinearity=lasagne.nonlinearities.softmax)\n", | |
"\n", | |
"#un-broadcast back into (batch,tick,probabilities)\n", | |
"predicted_probabilities = UnbroadcastLayer(predicted_probabilities_each_tick, \n", | |
" broadcast_layer=broadcast_decoder_ticks)\n", | |
"\n", | |
"print \"output shape = \", predicted_probabilities.output_shape\n", | |
"\n", | |
"#remove if you know what you're doing (e.g. 1d convolutions or fixed shape)\n", | |
"assert predicted_probabilities.output_shape == (None, None, 10373)" | |
] | |
}, | |
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"next_word_probas = get_output(predicted_probabilities)\n", | |
"\n", | |
"reference_answers = sentences[:,1:]\n", | |
"output_mask = sentence_mask[:,1:]\n", | |
"\n", | |
"#write symbolic loss function to train NN for\n", | |
"loss = lasagne.objectives.categorical_crossentropy(\n", | |
" next_word_probas[:, :-1].reshape((-1, n_tokens)),\n", | |
" reference_answers.reshape((-1,))\n", | |
").reshape(reference_answers.shape)\n", | |
"\n", | |
"############# TO CODE IT BY YOURSELF ##################\n", | |
"loss = <mean over non-PAD tokens>" | |
] | |
}, | |
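{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blank above: average the per-token crossentropy over real (non-PAD) tokens only, using the shifted mask.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sum of losses at real tokens, divided by the number of real tokens\n",
"loss = T.sum(loss * output_mask) / T.sum(output_mask)"
]
},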
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#trainable NN weights\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"weights = <all trainable network weights>\n",
"updates = <your favorite optimizer>"
]
},
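{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blanks above; adam is just one optimizer choice.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# collect every trainable parameter reachable from the output layer\n",
"weights = get_all_params(predicted_probabilities, trainable=True)\n",
"updates = lasagne.updates.adam(loss, weights)"
]
},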
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#compile a function that takes image features and an input sentence, outputs loss and updates weights\n",
"#please note that your functions must accept image features as the FIRST param and sentences as the second one\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"train_step = <>\n",
"val_step = <>"
]
},
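{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A possible sketch for the blanks above. Note the argument order: image features first, sentences second, matching what `generate_batch` returns below.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# training step updates the weights; validation step only computes the loss\n",
"train_step = theano.function([image_vectors, sentences], loss, updates=updates, allow_input_downcast=True)\n",
"val_step = theano.function([image_vectors, sentences], loss, allow_input_downcast=True)"
]
},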
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# Training\n",
"\n",
"* You first have to implement a batch generator\n",
"* Then the network gets trained the usual way"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"captions = np.array(captions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from random import choice\n",
"\n",
"def generate_batch(images, captions, batch_size, max_caption_len=None):\n",
" #sample random numbers for image/caption indicies\n", | |
" random_image_ix = np.random.randint(0, len(images), size=batch_size)\n", | |
" \n", | |
" #get images\n", | |
" batch_images = images[random_image_ix]\n", | |
" \n", | |
" #5-7 captions for each image\n", | |
" captions_for_batch_images = captions[random_image_ix]\n", | |
" \n", | |
" #pick 1 from 5-7 captions for each image\n", | |
" batch_captions = map(choice, captions_for_batch_images)\n", | |
" \n", | |
" #convert to matrix\n", | |
" batch_captions_ix = as_matrix(batch_captions,max_len=max_caption_len)\n", | |
" \n", | |
" return batch_images, batch_captions_ix" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"generate_batch(img_codes,captions, 3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Main loop\n", | |
"* We recommend you to periodically evaluate the network using the next \"apply trained model\" block\n", | |
" * its safe to interrupt training, run a few examples and start training again" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"batch_size = 50  #adjust me\n",
"n_epochs = 100  #adjust me\n",
"n_batches_per_epoch = 50  #adjust me\n",
"n_validation_batches = 5  #how many batches are used for validation after each epoch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"\n",
"for epoch in range(n_epochs):\n",
"    train_loss = 0\n",
"    for _ in tqdm(range(n_batches_per_epoch)):\n",
"        train_loss += train_step(*generate_batch(img_codes, captions, batch_size))\n",
"    train_loss /= n_batches_per_epoch\n",
"\n",
"    val_loss = 0\n",
"    for _ in range(n_validation_batches):\n",
"        val_loss += val_step(*generate_batch(img_codes, captions, batch_size))\n",
"    val_loss /= n_validation_batches\n",
"\n",
"    print('\\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss, val_loss))\n",
"\n",
"print(\"Finished :)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Apply trained model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#the same kind of network you used last week, but a bit smaller\n",
"from pretrained_lenet import build_model, preprocess, MEAN_VALUES\n",
"\n",
"# build GoogLeNet\n",
"lenet = build_model()\n",
"\n",
"#load weights\n",
"lenet_weights = pickle.load(open('data/blvc_googlenet.pkl'))['param values']\n", | |
"set_all_param_values(lenet[\"prob\"], lenet_weights)\n", | |
"\n", | |
"#compile get_features\n", | |
"cnn_input_var = lenet['input'].input_var\n", | |
"cnn_feature_layer = lenet['loss3/classifier']\n", | |
"get_cnn_features = theano.function([cnn_input_var], lasagne.layers.get_output(cnn_feature_layer))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from matplotlib import pyplot as plt\n", | |
"%matplotlib inline\n", | |
"\n", | |
"#sample image\n", | |
"img = plt.imread('data/Dog-and-Cat.jpg')\n", | |
"img = preprocess(img)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#deprocess and show, one line :)\n", | |
"from pretrained_lenet import MEAN_VALUES\n", | |
"plt.imshow(np.transpose((img[0] + MEAN_VALUES)[::-1],[1,2,0]).astype('uint8'))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Generate caption" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"last_word_probas_det = get_output(predicted_probabilities,deterministic=False)[:,-1]\n", | |
"\n", | |
"get_probs = theano.function([image_vectors,sentences], last_word_probas_det)\n", | |
"\n", | |
"#this is exactly the generation function from week5 classwork,\n", | |
"#except now we condition on image features instead of words\n", | |
"def generate_caption(image,caption_prefix = (\"START\",),t=1,sample=True,max_len=100):\n", | |
" image_features = get_cnn_features(image)\n", | |
" caption = list(caption_prefix)\n", | |
" for _ in range(max_len):\n", | |
" \n", | |
" next_word_probs = get_probs(image_features,as_matrix([caption]) ).ravel()\n", | |
" #apply temperature\n", | |
" next_word_probs = next_word_probs**t / np.sum(next_word_probs**t)\n", | |
"\n", | |
" if sample:\n", | |
" next_word = np.random.choice(vocab,p=next_word_probs) \n", | |
" else:\n", | |
" next_word = vocab[np.argmax(next_word_probs)]\n", | |
"\n", | |
" caption.append(next_word)\n", | |
"\n", | |
" if next_word==\"#END#\":\n", | |
" break\n", | |
" \n", | |
" return caption" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"for i in range(10):\n", | |
" print ' '.join(generate_caption(img,t=1.)[1:-1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Bonus Part\n", | |
"- Use ResNet Instead of GoogLeNet\n", | |
"- Use W2V as embedding\n", | |
"- Use Attention" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda env:python2]", | |
"language": "python", | |
"name": "conda-env-python2-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |