@rosterloh
Last active June 6, 2018 20:04
CV-ND Project 2: analyse.py exercises the COCO data loader and the EncoderCNN/DecoderRNN from model.py; both files are listed below, followed by two console runs.
import sys
sys.path.append('/opt/cocoapi/PythonAPI')
from pycocotools.coco import COCO
import nltk
nltk.download('punkt')
from data_loader import get_loader
from torchvision import transforms
import torch
from collections import Counter
import numpy as np
import torch.utils.data as data
from model import EncoderCNN, DecoderRNN
# Define a transform to pre-process the training images.
transform_train = transforms.Compose([
    transforms.Resize(256),                      # smaller edge of image resized to 256
    transforms.RandomCrop(224),                  # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),           # horizontally flip image with probability 0.5
    transforms.ToTensor(),                       # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),  # ImageNet mean/std, as the pre-trained ResNet expects
                         (0.229, 0.224, 0.225))])
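# (Optional) Uncomment to sanity-check the pipeline on any RGB image;
# 'example.jpg' is a placeholder path.
# from PIL import Image
# img = Image.open('example.jpg').convert('RGB')
# print(transform_train(img).shape)  # torch.Size([3, 224, 224])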
# Set the minimum word count threshold.
vocab_threshold = 4
# Specify the batch size.
batch_size = 10
# Obtain the data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=True)  # set False to rebuild the vocabulary after changing vocab_threshold
# Print the total number of keys in the word2idx dictionary.
print('Total number of tokens in vocabulary:', len(data_loader.dataset.vocab))
# Tally the total number of training captions with each length.
# counter = Counter(data_loader.dataset.caption_lengths)
# lengths = sorted(counter.items(), key=lambda pair: pair[1], reverse=True)
# for value, count in lengths:
#     print('value: %2d --- count: %5d' % (value, count))
# Randomly sample a caption length, and sample indices with that length.
indices = data_loader.dataset.get_train_indices()
print('sampled indices:', indices)
# Create and assign a batch sampler to retrieve a batch with the sampled indices.
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler
# Obtain the batch.
images, captions = next(iter(data_loader))
print('images.shape:', images.shape)
print('captions.shape:', captions.shape)
# (Optional) Uncomment the lines of code below to print the pre-processed images and captions.
# print('images:', images)
# print('captions:', captions)
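# For reference, dataset.get_train_indices() (implemented in data_loader.py, which is
# not part of this gist) boils down to something like the sketch below; the attribute
# names are assumptions based on how the loader is used above.
def sketch_get_train_indices(dataset, batch_size):
    # sample one caption length, weighted by how often it occurs ...
    sel_length = np.random.choice(dataset.caption_lengths)
    # ... then sample batch_size caption indices that all share that length
    all_indices = np.where([length == sel_length for length in dataset.caption_lengths])[0]
    return list(np.random.choice(all_indices, size=batch_size))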
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Running on', device)
# Specify the dimensionality of the image embedding.
embed_size = 256
# Initialize the encoder. (Optional: Add additional arguments if necessary.)
encoder = EncoderCNN(embed_size)
# Move the encoder to GPU if CUDA is available.
encoder.to(device)
# Move the last batch of images (obtained above) to GPU if CUDA is available.
images = images.to(device)
# Pass the images through the encoder.
features = encoder(images)
print('type(features):', type(features))
print('features.shape:', features.shape)
print('isCUDA:', features.is_cuda)
# Check that your encoder satisfies some requirements of the project! :D
assert type(features)==torch.Tensor, "Encoder output needs to be a PyTorch Tensor."
assert (features.shape[0]==batch_size) & (features.shape[1]==embed_size), "The shape of the encoder output is incorrect."
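# Shape flow for this batch: images (10, 3, 224, 224) -> ResNet-152 backbone
# (10, 2048, 1, 1) -> flatten (10, 2048) -> linear + batch norm (10, 256),
# matching the features.shape printed in the console runs below.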
# Specify the number of features in the hidden state of the RNN decoder.
hidden_size = 512
# Store the size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)
# Initialize the decoder.
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
# Move the decoder to GPU if CUDA is available.
decoder.to(device)
# Move the last batch of captions (obtained above) to GPU if CUDA is available.
captions = captions.to(device)
# Pass the encoder output and captions through the decoder.
outputs = decoder(features, captions)
print('type(outputs):', type(outputs))
print('outputs.shape:', outputs.shape)
# Check that your decoder satisfies some requirements of the project! :D
assert type(outputs)==torch.Tensor, "Decoder output needs to be a PyTorch Tensor."
assert (outputs.shape[0]==batch_size) & (outputs.shape[1]==captions.shape[1]) & (outputs.shape[2]==vocab_size), "The shape of the decoder output is incorrect."
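The time dimension survives the decoder because forward drops the final <end> token (one step lost) and prepends the image feature (one step gained), so outputs.shape[1] == captions.shape[1]. The model module imported above, model.py, follows.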
import torch
import torch.nn as nn
import torchvision.models as models
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace the top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]  # drop the final fc layer
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Extract feature vectors from input images."""
        with torch.no_grad():  # keep the pretrained ResNet frozen
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))  # only these layers are trained
        return features
class DecoderRNN(nn.Module):
    r"""
    Caption decoder from Show and Tell (https://arxiv.org/pdf/1411.4555.pdf)

    Args:
        embed_size (int): The size of each embedding vector
        hidden_size (int): The number of features in the hidden state of the LSTM
        vocab_size (int): The size of the dictionary of embeddings
        num_layers (int): The number of recurrent layers in the LSTM

    Inputs:
        features (batch_size, embed_size): Tensor containing the encoder output
        captions (batch_size, caption_length): Tensor of caption token ids

    Outputs:
        out (batch_size, caption_length, vocab_size): Unnormalised scores over
            the vocabulary at every position in the sequence
    """
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        # The LSTM takes word embeddings as inputs and outputs hidden states
        # with dimensionality hidden_size.
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        # The linear layer maps from hidden-state space to vocabulary space.
        # Raw scores are returned; CrossEntropyLoss applies log-softmax itself.
        self.linear = nn.Linear(hidden_size, vocab_size)
    def forward(self, features, captions):
        """Decode image feature vectors and generate captions."""
        # Remove the <end> token; the decoder never receives it as input.
        captions = captions[:, :-1]
        embeddings = self.embed(captions)
        # Prepend the image feature as the first input step of the sequence.
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.lstm(embeddings)
        out = self.linear(hiddens)  # (batch_size, caption_length, vocab_size)
        return out
    def sample(self, inputs, states=None, max_len=20):
        """Accept a pre-processed image tensor (inputs) and return a predicted
        sentence (list of tensor ids of length max_len), here via greedy search."""
        sampled_ids = []
        for _ in range(max_len):
            hiddens, states = self.lstm(inputs, states)  # (1, 1, hidden_size)
            scores = self.linear(hiddens.squeeze(1))     # (1, vocab_size)
            predicted = scores.argmax(dim=1)             # most likely next token
            sampled_ids.append(predicted.item())
            inputs = self.embed(predicted).unsqueeze(1)  # feed the prediction back in
        return sampled_ids
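A minimal greedy-inference sketch using the pieces defined above; image stands for one transformed image tensor, and idx2word is assumed to be the id-to-word mapping exposed by the project's vocabulary class:

image = image.unsqueeze(0).to(device)    # (1, 3, 224, 224)
features = encoder(image).unsqueeze(1)   # (1, 1, embed_size): one LSTM input step
word_ids = decoder.sample(features)      # list of up to max_len token ids
vocab = data_loader.dataset.vocab
print(' '.join(vocab.idx2word[i] for i in word_ids))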
/home/rosterloh/miniconda3/envs/cv-nd/bin/python /home/rosterloh/cv-nd/P2_Image_Captioning/analyse.py
[nltk_data] Downloading package punkt to /home/rosterloh/nltk_data...
[nltk_data] Package punkt is already up-to-date!
Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=0.54s)
creating index...
index created!
Obtaining caption lengths...
100%|██████████| 414113/414113 [00:38<00:00, 10650.50it/s]
Total number of tokens in vocabulary: 9956
sampled indices: [199955, 76317, 13471, 289884, 182100, 101662, 386165, 85410, 384008, 80888]
images.shape: torch.Size([10, 3, 224, 224])
captions.shape: torch.Size([10, 15])
Running on cuda
type(features): <class 'torch.Tensor'>
features.shape: torch.Size([10, 256])
isCUDA: True
Embeddings: torch.Size([10, 15, 256])
LSTM Output: torch.Size([10, 16, 512])
Linear Output: torch.Size([10, 16, 9956])
type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([10, 16, 9956])
Traceback (most recent call last):
File "/home/rosterloh/cv-nd/P2_Image_Captioning/analyse.py", line 112, in <module>
assert (outputs.shape[0]==batch_size) & (outputs.shape[1]==captions.shape[1]) & (outputs.shape[2]==vocab_size), "The shape of the decoder output is incorrect."
AssertionError: The shape of the decoder output is incorrect.
Process finished with exit code 1
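The assertion above failed because an earlier version of forward embedded every caption token: 15 embedded words plus the prepended image feature made 16 LSTM steps against captions of length 15. Trimming the <end> token with captions[:, :-1], as in the model.py listed above, restores the match; in the re-run below, 11 embedded words plus the image feature give 12 steps for captions of length 12.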
Embeddings: torch.Size([10, 11, 256])
LSTM Output: torch.Size([10, 12, 512])
Linear Output: torch.Size([10, 12, 9955])
type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([10, 12, 9955])