Image Captioning Homework
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h1 align=\"center\"> Image Captioning </h1> \n",
"\n",
"To begin with, let us download the dataset of image features from a pre-trained GoogleNet."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"!wget https://www.dropbox.com/s/d50pqlm19c6f6w5/data.tar.gz?dl=0 -O data.tar.gz\n",
"!tar -xvzf data.tar.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data preprocessing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%time\n",
"# Read Dataset\n",
"import numpy as np\n",
"import pickle\n",
"\n",
"img_codes = np.load(\"data/image_codes.npy\")\n",
"captions = pickle.load(open('data/caption_tokens.pcl', 'rb'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print \"each image code is a 1000-unit vector:\", img_codes.shape\n",
"print img_codes[0,:10]\n",
"print '\\n\\n'\n",
"print \"for each image there are 5-7 descriptions, e.g.:\\n\"\n",
"print '\\n'.join(captions[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#split descriptions into tokens\n",
"for img_i in range(len(captions)):\n",
" for caption_i in range(len(captions[img_i])):\n",
" sentence = captions[img_i][caption_i] \n",
" captions[img_i][caption_i] = [\"#START#\"]+sentence.split(' ')+[\"#END#\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Build a Vocabulary\n",
"\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"word_counts = <here should be dict word:number of entrances>\n",
"\n",
"vocab = ['#UNK#', '#START#', '#END#']\n",
"vocab += [k for k, v in word_counts.items() if v >= 5]\n",
"n_tokens = len(vocab)\n",
"\n",
"assert 10000 <= n_tokens <= 10500\n",
"\n",
"word_to_index = {w: i for i, w in enumerate(vocab)}"
]
},
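{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Hint (one possible approach, not the official solution):* the sketch below builds `word_counts` with `collections.Counter`, counting how often each token occurs across all captions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# A sketch of one possible way to build word_counts:\n",
"# count token frequencies over all captions (already tokenized above).\n",
"from collections import Counter\n",
"\n",
"word_counts = Counter()\n",
"for img_captions in captions:\n",
"    for caption in img_captions:\n",
"        word_counts.update(caption)"
]
},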
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"PAD_ix = -1\n",
"UNK_ix = vocab.index('#UNK#')\n",
"\n",
"def as_matrix(sequences,max_len=None):\n",
" max_len = max_len or max(map(len,sequences))\n",
" \n",
" matrix = np.zeros((len(sequences),max_len),dtype='int32')+PAD_ix\n",
" for i,seq in enumerate(sequences):\n",
" row_ix = [word_to_index.get(word,UNK_ix) for word in seq[:max_len]]\n",
" matrix[i,:len(row_ix)] = row_ix\n",
" \n",
" return matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#try it out on several descriptions of a random image\n",
"as_matrix(captions[1337])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### My Neural Network"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# network shapes. \n",
"CNN_FEATURE_SIZE = img_codes.shape[1]\n",
"EMBED_SIZE = 128 #pls change me if u want\n",
"LSTM_UNITS = 200 #pls change me if u want"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import theano\n",
"import lasagne\n",
"import theano.tensor as T\n",
"from lasagne.layers import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Input Variable\n",
"sentences = T.imatrix()# [batch_size x time] of word ids\n",
"image_vectors = T.matrix() # [batch size x unit] of CNN image features\n",
"sentence_mask = T.neq(sentences, PAD_ix)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#network inputs\n",
"l_words = InputLayer((None, None), sentences)\n",
"l_mask = InputLayer((None, None), sentence_mask)\n",
"\n",
"#embeddings for words \n",
"############# TO CODE IT BY YOURSELF ##################\n",
"l_word_embeddings = <Embedding Layer>"
]
},
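{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Hint (one possible choice, not the official solution):* `lasagne.layers.EmbeddingLayer` maps word ids to dense `EMBED_SIZE`-dimensional vectors."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: an embedding layer over the n_tokens word ids.\n",
"l_word_embeddings = EmbeddingLayer(l_words, input_size=n_tokens, output_size=EMBED_SIZE)"
]
},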
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# input layer for image features\n",
"l_image_features = InputLayer((None, CNN_FEATURE_SIZE), image_vectors)\n",
"\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"#convert 1000 image features from googlenet to whatever LSTM_UNITS you have set\n",
"#it's also a good idea to add some dropout here and there\n",
"l_image_features_small = <Apply Dropout to regularise your Net>\n",
"l_image_features_small = <Apply linear to get LSTM_UNITS size representation>\n",
"assert l_image_features_small.shape == (None, LSTM_UNITS)"
]
},
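{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Hint (a sketch, not the official solution):* dropout for regularisation followed by a linear `DenseLayer` that projects the 1000 image features to `LSTM_UNITS`; the dropout probability of 0.5 is an arbitrary choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: regularise the image features, then project them to LSTM_UNITS\n",
"# so they can later serve as the initial LSTM state (p=0.5 is arbitrary).\n",
"l_image_features_small = DropoutLayer(l_image_features, p=0.5)\n",
"l_image_features_small = DenseLayer(l_image_features_small, num_units=LSTM_UNITS, nonlinearity=None)\n",
"assert l_image_features_small.output_shape == (None, LSTM_UNITS)"
]
},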
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"############# TO CODE IT BY YOURSELF ##################\n",
"# Concatinate image features and word embedings in one sequence \n",
"decoder = LSTMLayer(<What should be here?>,\n",
" num_units=LSTM_UNITS,\n",
" cell_init=<Use your brain =)>,\n",
" mask_input=<Mask?>,\n",
" grad_clipping=<boom grads>)"
]
},
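{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Hint (one common design, sketched below; not the only valid answer):* feed the word embeddings as the input sequence, use the projected image features as the initial cell state, mask out padding, and clip gradients (the threshold of 10 is an arbitrary choice)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch of one possible decoder: word embeddings as the input sequence,\n",
"# projected image features as the initial cell state, padding masked out.\n",
"decoder = LSTMLayer(l_word_embeddings,\n",
"                    num_units=LSTM_UNITS,\n",
"                    cell_init=l_image_features_small,\n",
"                    mask_input=l_mask,\n",
"                    grad_clipping=10.0)"
]
},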
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Decoding of rnn hiden states\n",
"from broadcast import BroadcastLayer,UnbroadcastLayer\n",
"\n",
"#apply whatever comes next to each tick of each example in a batch. Equivalent to 2 reshapes\n",
"broadcast_decoder_ticks = BroadcastLayer(decoder, (0, 1))\n",
"print \"broadcasted decoder shape = \", broadcast_decoder_ticks.output_shape\n",
"\n",
"predicted_probabilities_each_tick = DenseLayer(broadcast_decoder_ticks,\n",
" n_tokens, \n",
" nonlinearity=lasagne.nonlinearities.softmax)\n",
"\n",
"#un-broadcast back into (batch,tick,probabilities)\n",
"predicted_probabilities = UnbroadcastLayer(predicted_probabilities_each_tick, \n",
" broadcast_layer=broadcast_decoder_ticks)\n",
"\n",
"print \"output shape = \", predicted_probabilities.output_shape\n",
"\n",
"#remove if you know what you're doing (e.g. 1d convolutions or fixed shape)\n",
"assert predicted_probabilities.output_shape == (None, None, 10373)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"next_word_probas = get_output(predicted_probabilities)\n",
"\n",
"reference_answers = sentences[:,1:]\n",
"output_mask = sentence_mask[:,1:]\n",
"\n",
"#write symbolic loss function to train NN for\n",
"loss = lasagne.objectives.categorical_crossentropy(\n",
" next_word_probas[:, :-1].reshape((-1, n_tokens)),\n",
" reference_answers.reshape((-1,))\n",
").reshape(reference_answers.shape)\n",
"\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"loss = <mean over non-PAD tokens>"
]
},
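{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Hint (a sketch of the masked mean):* zero out the crossentropy at padded positions and divide by the number of real tokens; the mask is cast to float to avoid integer division. Here `loss` is assumed to still hold the per-token crossentropy matrix computed above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: average the per-token crossentropy over non-PAD positions only.\n",
"mask_float = T.cast(output_mask, theano.config.floatX)\n",
"loss = T.sum(loss * mask_float) / T.sum(mask_float)"
]
},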
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#trainable NN weights\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"weights = <all dnn weigts>\n",
"updates = <your favorite optimizer>"
]
},
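{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Hint (a sketch, not the official solution):* collect all trainable parameters reachable from the output layer and build updates with an optimizer of your choice; Adam with a learning rate of 1e-3 is shown as an arbitrary pick."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: all trainable parameters of the network, plus Adam updates\n",
"# (optimizer and learning rate are arbitrary choices).\n",
"weights = lasagne.layers.get_all_params(predicted_probabilities, trainable=True)\n",
"updates = lasagne.updates.adam(loss, weights, learning_rate=1e-3)"
]
},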
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#compile a function that takes input sentence and image mask, outputs loss and updates weights\n",
"#please not that your functions must accept image features as FIRST param and sentences as second one\n",
"############# TO CODE IT BY YOURSELF ##################\n",
"train_step = <>\n",
"val_step = <>"
]
},
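{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Hint (a sketch of the two compiled functions):* note the required argument order, image features first and sentences second. For a stricter validation loss you could rebuild the loss graph with `deterministic=True` to switch dropout off."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: the training step applies the updates, the validation step only evaluates the loss.\n",
"train_step = theano.function([image_vectors, sentences], loss, updates=updates, allow_input_downcast=True)\n",
"val_step = theano.function([image_vectors, sentences], loss, allow_input_downcast=True)"
]
},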
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# Training\n",
"\n",
"* You first have to implement a batch generator\n",
"* Than the network will get trained the usual way"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"captions = np.array(captions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from random import choice\n",
"\n",
"def generate_batch(images,captions,batch_size,max_caption_len=None):\n",
" #sample random numbers for image/caption indicies\n",
" random_image_ix = np.random.randint(0, len(images), size=batch_size)\n",
" \n",
" #get images\n",
" batch_images = images[random_image_ix]\n",
" \n",
" #5-7 captions for each image\n",
" captions_for_batch_images = captions[random_image_ix]\n",
" \n",
" #pick 1 from 5-7 captions for each image\n",
" batch_captions = map(choice, captions_for_batch_images)\n",
" \n",
" #convert to matrix\n",
" batch_captions_ix = as_matrix(batch_captions,max_len=max_caption_len)\n",
" \n",
" return batch_images, batch_captions_ix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"generate_batch(img_codes,captions, 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Main loop\n",
"* We recommend you to periodically evaluate the network using the next \"apply trained model\" block\n",
" * its safe to interrupt training, run a few examples and start training again"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"batch_size = 50 #adjust me\n",
"n_epochs = 100 #adjust me\n",
"n_batches_per_epoch = 50 #adjust me\n",
"n_validation_batches = 5 #how many batches are used for validation after each epoch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"\n",
"for epoch in range(n_epochs):\n",
" train_loss=0\n",
" for _ in tqdm(range(n_batches_per_epoch)):\n",
" train_loss += train_step(*generate_batch(img_codes,captions,batch_size))\n",
" train_loss /= n_batches_per_epoch\n",
" \n",
" val_loss=0\n",
" for _ in range(n_validation_batches):\n",
" val_loss += val_step(*generate_batch(img_codes,captions,batch_size))\n",
" val_loss /= n_validation_batches\n",
" \n",
" print('\\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss, val_loss))\n",
"\n",
"print(\"Finish :)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### apply trained model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#the same kind you did last week, but a bit smaller\n",
"from pretrained_lenet import build_model,preprocess,MEAN_VALUES\n",
"\n",
"# build googlenet\n",
"lenet = build_model()\n",
"\n",
"#load weights\n",
"lenet_weights = pickle.load(open('data/blvc_googlenet.pkl'))['param values']\n",
"set_all_param_values(lenet[\"prob\"], lenet_weights)\n",
"\n",
"#compile get_features\n",
"cnn_input_var = lenet['input'].input_var\n",
"cnn_feature_layer = lenet['loss3/classifier']\n",
"get_cnn_features = theano.function([cnn_input_var], lasagne.layers.get_output(cnn_feature_layer))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from matplotlib import pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"#sample image\n",
"img = plt.imread('data/Dog-and-Cat.jpg')\n",
"img = preprocess(img)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#deprocess and show, one line :)\n",
"from pretrained_lenet import MEAN_VALUES\n",
"plt.imshow(np.transpose((img[0] + MEAN_VALUES)[::-1],[1,2,0]).astype('uint8'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate caption"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"last_word_probas_det = get_output(predicted_probabilities,deterministic=False)[:,-1]\n",
"\n",
"get_probs = theano.function([image_vectors,sentences], last_word_probas_det)\n",
"\n",
"#this is exactly the generation function from week5 classwork,\n",
"#except now we condition on image features instead of words\n",
"def generate_caption(image,caption_prefix = (\"START\",),t=1,sample=True,max_len=100):\n",
" image_features = get_cnn_features(image)\n",
" caption = list(caption_prefix)\n",
" for _ in range(max_len):\n",
" \n",
" next_word_probs = get_probs(image_features,as_matrix([caption]) ).ravel()\n",
" #apply temperature\n",
" next_word_probs = next_word_probs**t / np.sum(next_word_probs**t)\n",
"\n",
" if sample:\n",
" next_word = np.random.choice(vocab,p=next_word_probs) \n",
" else:\n",
" next_word = vocab[np.argmax(next_word_probs)]\n",
"\n",
" caption.append(next_word)\n",
"\n",
" if next_word==\"#END#\":\n",
" break\n",
" \n",
" return caption"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for i in range(10):\n",
" print ' '.join(generate_caption(img,t=1.)[1:-1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Bonus Part\n",
"- Use ResNet Instead of GoogLeNet\n",
"- Use W2V as embedding\n",
"- Use Attention"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:python2]",
"language": "python",
"name": "conda-env-python2-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 0
}