Skip to content

Instantly share code, notes, and snippets.

@Nov05
Last active Mar 11, 2020
Embed
What would you like to do?
2020-03-07 CNN-LSTM image captioning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
class EncoderCNN(nn.Module):
    """Image encoder: pretrained ResNet-50 trunk followed by a linear projection.

    All ResNet-50 parameters are frozen (``requires_grad`` is switched off),
    so the only trainable weights in this module belong to ``self.embed``.
    """

    def __init__(self, embed_size):
        """Build the encoder.

        Args:
            embed_size: Size of the feature vector produced for each image.
        """
        super().__init__()
        backbone = models.resnet50(pretrained=True)
        # Freeze the pretrained weights; they are used as a fixed feature
        # extractor.
        for weight in backbone.parameters():
            weight.requires_grad_(False)
        # Keep every child module except the last one (the top fully
        # connected layer).
        trunk = list(backbone.children())[:-1]
        self.resnet = nn.Sequential(*trunk)
        # Project the trunk's output (as wide as the removed layer's input)
        # down to the requested embedding size.
        self.embed = nn.Linear(backbone.fc.in_features, embed_size)

    def forward(self, images):
        """Encode a batch of images.

        Args:
            images: Batch of image tensors accepted by the ResNet trunk
                (presumably [batch_size, 3, H, W] -- confirm against the
                data loader).

        Returns:
            Tensor of shape [batch_size, embed_size].
        """
        pooled = self.resnet(images)
        # Collapse everything after the batch dimension into one axis.
        flat = pooled.view(pooled.size(0), -1)
        return self.embed(flat)
class DecoderRNN(nn.Module):
    """LSTM caption decoder for a CNN-LSTM image-captioning model.

    The image feature vector is fed to the LSTM as the first element of the
    input sequence, followed by the embedded caption tokens.  ``forward`` is
    the teacher-forced training pass; ``sample`` (greedy decoding) and
    ``beam_search`` generate a caption and only read batch element 0 of
    ``features``.

    Neither ``sample`` nor ``beam_search`` applies ``self.dropout``; dropout
    is only used in ``forward``.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1,
                 p_dropout=0.1):
        """Build the decoder.

        Args:
            embed_size: Dimension of the token embeddings; must equal the
                size of the image feature vector, because both are fed to
                the same LSTM input.
            hidden_size: Number of features in the LSTM hidden state.
            vocab_size: Number of distinct tokens (embedding rows and output
                classes).
            num_layers: Number of stacked LSTM layers.
            p_dropout: Dropout probability applied to the token embeddings
                in ``forward``.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # The decoder embeds the caption tokens before feeding them to the
        # LSTM.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
        )
        self.dropout = nn.Dropout(p=p_dropout)
        # The LSTM input at every step is a single embed_size vector: the
        # image feature at step 0 and a token embedding afterwards.
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Kept for backward compatibility: no method of this class reads
        # ``self.hidden``; the LSTM starts from its default (zero) state
        # unless ``states`` is passed explicitly.
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=1):
        """Return a random initial ``(h, c)`` state for the LSTM.

        Args:
            batch_size: Batch dimension of the returned tensors.

        Returns:
            Tuple of two tensors, each of shape
            [num_layers, batch_size, hidden_size], created on the same
            device and with the same dtype as the output layer's weights.
        """
        ref = self.linear.weight
        shape = (self.num_layers, batch_size, self.hidden_size)
        return (torch.randn(shape, device=ref.device, dtype=ref.dtype),
                torch.randn(shape, device=ref.device, dtype=ref.dtype))

    def forward(self, features, captions):
        """Teacher-forced pass over a batch of captions.

        Args:
            features: Image features, shape [batch_size, embed_size].
            captions: Token ids, shape [batch_size, sequence_size].

        Returns:
            Log-probabilities over the vocabulary, shape
            [batch_size, sequence_size, vocab_size].  Position ``t`` is the
            prediction made after seeing the image feature and caption
            tokens ``0 .. t-1``.
        """
        # The input sequence is the caption shifted right by one position:
        # the image feature comes first and the last caption token is
        # dropped, because nothing is predicted after it.
        embedded = self.dropout(self.embedding(captions[:, :-1]))
        lstm_input = torch.cat((features.unsqueeze(1), embedded), dim=1)
        # LSTM input:  [batch_size, sequence_size, embed_size]
        # LSTM output: [batch_size, sequence_size, hidden_size]
        lstm_output, _ = self.lstm(lstm_input)
        output = self.linear(lstm_output)
        return F.log_softmax(output, dim=2)

    def sample(self, features, states=None, max_len=20, end_idx=None):
        """Greedy decoding: repeatedly pick the most likely next token.

        See "Show and Tell" (https://arxiv.org/pdf/1411.4555.pdf), section
        on inference.

        Args:
            features: Image feature tensor, shape [1, embed_size].  Only
                batch element 0 is decoded.
            states: Optional initial LSTM ``(h, c)`` state.
            max_len: Maximum number of tokens to generate.
            end_idx: Optional id of the end-of-sentence token.  When given,
                decoding stops right after that token is produced; when
                ``None`` exactly ``max_len`` tokens are generated.

        Returns:
            List of predicted token ids (Python ints).
        """
        tokens = []
        lstm_input = features.unsqueeze(1)  # [1, 1, embed_size]
        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            for _ in range(max_len):
                lstm_output, states = self.lstm(lstm_input, states)
                scores = self.linear(lstm_output)[0, 0]  # [vocab_size]
                best = scores.argmax()
                tokens.append(best.item())
                if end_idx is not None and tokens[-1] == end_idx:
                    break
                # Feed the chosen token back in as the next input,
                # shape [1, 1, embed_size].
                lstm_input = self.embedding(best.view(1, 1))
        return tokens

    def beam_search(self, features, states=None, max_len=20, k=20,
                    end_idx=None):
        """Beam-search decoding: keep the ``k`` best partial captions.

        At every step each hypothesis is extended by one token, all
        extensions are ranked by their summed log-probability, and the best
        ``k`` are kept.  See "Show and Tell"
        (https://arxiv.org/pdf/1411.4555.pdf); ``k=1`` is greedy search.

        Args:
            features: Image feature tensor, shape [1, embed_size].  Only
                batch element 0 is decoded.
            states: Optional initial LSTM ``(h, c)`` state.
            max_len: Maximum number of tokens to generate.
            k: Beam size; must be at least 1.
            end_idx: Optional id of the end-of-sentence token.  A hypothesis
                that ends with it is carried forward unchanged, and the
                search stops once every kept hypothesis has ended.  When
                ``None`` every hypothesis has exactly ``max_len`` tokens.

        Returns:
            Token ids (Python ints) of the highest-scoring hypothesis.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("beam size k must be at least 1")
        # A hypothesis cannot have more distinct continuations than there
        # are tokens, so a beam wider than the vocabulary is clamped.
        per_beam = min(k, self.linear.out_features)

        def finished(tokens):
            return end_idx is not None and bool(tokens) and tokens[-1] == end_idx

        # Each hypothesis is (tokens, score, lstm_states); the LSTM state
        # travels with the hypothesis it belongs to.
        beams = [([], 0.0, states)]
        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            for _ in range(max_len):
                candidates = []
                for tokens, score, hyp_states in beams:
                    if finished(tokens):
                        candidates.append((tokens, score, hyp_states))
                        continue
                    if tokens:
                        last = torch.tensor([[tokens[-1]]],
                                            device=features.device)
                        lstm_input = self.embedding(last)
                    else:
                        # First step: the image feature is the input.
                        lstm_input = features.unsqueeze(1)
                    lstm_output, next_states = self.lstm(lstm_input,
                                                         hyp_states)
                    # Token log-probabilities, shape [vocab_size].
                    log_probs = F.log_softmax(self.linear(lstm_output),
                                              dim=2)[0, 0]
                    # Only the `per_beam` best continuations of this
                    # hypothesis can make it into the global top-k, so the
                    # rest of the vocabulary is never enumerated.
                    top_scores, top_tokens = log_probs.topk(per_beam)
                    for gain, token in zip(top_scores.tolist(),
                                           top_tokens.tolist()):
                        # Log-probabilities are added, not multiplied.
                        candidates.append(
                            (tokens + [token], score + gain, next_states))
                # Order all candidates by score and keep the k best.
                beams = sorted(candidates, key=lambda c: c[1],
                               reverse=True)[:k]
                if all(finished(tokens) for tokens, _, _ in beams):
                    break
        return list(beams[0][0])
@Nov05
Copy link
Author

Nov05 commented Mar 7, 2020

Sequential Inputs

So, an LSTM looks at inputs sequentially. In PyTorch, there are two ways to do this.

The first is pretty intuitive: for all the inputs in a sequence, which in this case would be a feature from an image, a start word, the next word, the next word, and so on (until the end of a sequence/batch), you loop through each input like so:

for i in inputs:
    # Step through the sequence one element at a time, reshaping each
    # element to a 3-D tensor of shape [1, 1, -1] before the LSTM call.
    # After each step, `hidden` holds the state returned by that step and
    # is passed back in on the next iteration.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

The second approach, which this project uses, is to give the LSTM our entire sequence and have it produce a set of outputs and the last hidden state:

# The first value returned by the LSTM holds the hidden state at every step
# of the sequence; the second is just the most recent hidden state.

# Concatenate the list of inputs into one tensor and add the extra 2nd
# dimension, giving shape [len(inputs), 1, -1].
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # fresh random state, replacing any previous hidden state
out, hidden = lstm(inputs, hidden)

@Nov05
Copy link
Author

Nov05 commented Mar 7, 2020

Show and Tell: A Neural Image Caption Generator
https://arxiv.org/pdf/1411.4555.pdf

【Inference】There are multiple approaches that can be used
to generate a sentence given an image, with NIC. The first
one is 【Sampling】, where we just sample the first word
according to p1, then provide the corresponding embedding
as input and sample p2, continuing like this until we sample
the special end-of-sentence token or some maximum length.
The second one is 【BeamSearch】: iteratively consider the set
of the k best sentences up to time t as candidates to generate
sentences of size t + 1, and keep only the resulting best k
of them. This better approximates $S = \arg\max_{S'} p(S' \mid I)$.
We used the BeamSearch approach in the following experiments,
with a beam of size 20. Using a beam size of 1 (i.e.,
greedy search) did degrade our results by 2 BLEU points on
average.

@Nov05
Copy link
Author

Nov05 commented Mar 7, 2020

Reviews

1. The chosen CNN architecture in the CNNEncoder class in model.py makes sense as an encoder for the image captioning task.

Awesome!! The CNN architecture for the CNN encoder looks good, you made a wise move to use the resnet pre-trained model. 🎉
Suggestion – have a look at this link (https://cs224d.stanford.edu/reports/msoh.pdf) to read more on CNN-LSTM architecture used for image captioning. 😄

2. The chosen RNN architecture in the RNNDecoder class in model.py makes sense as a decoder for the image captioning task.

Awesome again indeed!! Good job on the RNN decoder architecture, the embeddings layer is necessary for the captions. Great job on implementing that correctly! 😄 🎁
Suggestion – Have a look at this link (https://kapilddatascience.wordpress.com/2017/01/07/image-captioning-using-rnn-and-lstm/) to know more about how RNN and LSTM are used in image captioning. 😄

@Nov05
Copy link
Author

Nov05 commented Mar 8, 2020

How to Implement a Beam Search Decoder for Natural Language Processing
by Jason Brownlee on January 5, 2018
Last Updated on August 7, 2019
https://machinelearningmastery.com/beam-search-decoder-natural-language-processing/

CAUTION:

  1. Log probabilities are supposed to be added rather than being multiplied.
  2. The beam search implementation in that post does not keep a hidden state for each candidate sequence, so it is not suitable for an RNN decoder as written. (Notebook)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment