@rosterloh
Last active June 6, 2018 20:04
CV-ND Project 2: analyse.py exercises the COCO data loader and the EncoderCNN/DecoderRNN from model.py; both files are listed below, followed by two console runs.
import sys
sys.path.append('/opt/cocoapi/PythonAPI')
from pycocotools.coco import COCO
import nltk
nltk.download('punkt')
from data_loader import get_loader
from torchvision import transforms
import torch
from collections import Counter
import numpy as np
import torch.utils.data as data
from model import EncoderCNN, DecoderRNN
# Define a transform to pre-process the training images.
transform_train = transforms.Compose([
    transforms.Resize(256),                      # smaller edge of image resized to 256
    transforms.RandomCrop(224),                  # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),           # horizontally flip image with probability 0.5
    transforms.ToTensor(),                       # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),  # ImageNet mean/std, as the pre-trained ResNet expects
                         (0.229, 0.224, 0.225))])
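# (Optional) Uncomment to sanity-check the pipeline on any RGB image;
# 'example.jpg' is a placeholder path.
# from PIL import Image
# img = Image.open('example.jpg').convert('RGB')
# print(transform_train(img).shape)  # torch.Size([3, 224, 224])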
# Set the minimum word count threshold.
vocab_threshold = 4
# Specify the batch size.
batch_size = 10
# Obtain the data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=True)  # set False to rebuild the vocabulary after changing vocab_threshold
# Print the total number of keys in the word2idx dictionary.
print('Total number of tokens in vocabulary:', len(data_loader.dataset.vocab))
# Tally the total number of training captions with each length.
# counter = Counter(data_loader.dataset.caption_lengths)
# lengths = sorted(counter.items(), key=lambda pair: pair[1], reverse=True)
# for value, count in lengths:
#     print('value: %2d --- count: %5d' % (value, count))
# Randomly sample a caption length, and sample indices with that length.
indices = data_loader.dataset.get_train_indices()
print('sampled indices:', indices)
# Create and assign a batch sampler to retrieve a batch with the sampled indices.
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler
# Obtain the batch.
images, captions = next(iter(data_loader))
print('images.shape:', images.shape)
print('captions.shape:', captions.shape)
# (Optional) Uncomment the lines of code below to print the pre-processed images and captions.
# print('images:', images)
# print('captions:', captions)
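# For reference, dataset.get_train_indices() (implemented in data_loader.py, which is
# not part of this gist) boils down to something like the sketch below; the attribute
# names are assumptions based on how the loader is used above.
def sketch_get_train_indices(dataset, batch_size):
    # sample one caption length, weighted by how often it occurs ...
    sel_length = np.random.choice(dataset.caption_lengths)
    # ... then sample batch_size caption indices that all share that length
    all_indices = np.where([length == sel_length for length in dataset.caption_lengths])[0]
    return list(np.random.choice(all_indices, size=batch_size))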
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Running on', device)
# Specify the dimensionality of the image embedding.
embed_size = 256
# Initialize the encoder. (Optional: Add additional arguments if necessary.)
encoder = EncoderCNN(embed_size)
# Move the encoder to GPU if CUDA is available.
encoder.to(device)
# Move the last batch of images (obtained above) to GPU if CUDA is available.
images = images.to(device)
# Pass the images through the encoder.
features = encoder(images)
print('type(features):', type(features))
print('features.shape:', features.shape)
print('isCUDA:', features.is_cuda)
# Check that your encoder satisfies some requirements of the project! :D
assert type(features)==torch.Tensor, "Encoder output needs to be a PyTorch Tensor."
assert (features.shape[0]==batch_size) & (features.shape[1]==embed_size), "The shape of the encoder output is incorrect."
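# Shape flow for this batch: images (10, 3, 224, 224) -> ResNet-152 backbone
# (10, 2048, 1, 1) -> flatten (10, 2048) -> linear + batch norm (10, 256),
# matching the features.shape printed in the console runs below.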
# Specify the number of features in the hidden state of the RNN decoder.
hidden_size = 512
# Store the size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)
# Initialize the decoder.
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
# Move the decoder to GPU if CUDA is available.
decoder.to(device)
# Move the last batch of captions (obtained above) to GPU if CUDA is available.
captions = captions.to(device)
# Pass the encoder output and captions through the decoder.
outputs = decoder(features, captions)
print('type(outputs):', type(outputs))
print('outputs.shape:', outputs.shape)
# Check that your decoder satisfies some requirements of the project! :D
assert type(outputs)==torch.Tensor, "Decoder output needs to be a PyTorch Tensor."
assert (outputs.shape[0]==batch_size) & (outputs.shape[1]==captions.shape[1]) & (outputs.shape[2]==vocab_size), "The shape of the decoder output is incorrect."
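The time dimension survives the decoder because forward drops the final <end> token (one step lost) and prepends the image feature (one step gained), so outputs.shape[1] == captions.shape[1]. The model module imported above, model.py, follows.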
import torch
import torch.nn as nn
import torchvision.models as models
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace the top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]  # drop the final fc layer
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Extract feature vectors from input images."""
        with torch.no_grad():  # keep the pretrained ResNet frozen
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))  # only these layers are trained
        return features
class DecoderRNN(nn.Module):
    r"""
    Caption decoder from Show and Tell (https://arxiv.org/pdf/1411.4555.pdf)

    Args:
        embed_size (int): The size of each embedding vector
        hidden_size (int): The number of features in the hidden state of the LSTM
        vocab_size (int): The size of the dictionary of embeddings
        num_layers (int): The number of recurrent layers in the LSTM

    Inputs:
        features (batch_size, embed_size): Tensor containing the encoder output
        captions (batch_size, caption_length): Tensor of caption token ids

    Outputs:
        out (batch_size, caption_length, vocab_size): Unnormalised scores over
            the vocabulary at every position in the sequence
    """
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        # The LSTM takes word embeddings as inputs and outputs hidden states
        # with dimensionality hidden_size.
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        # The linear layer maps from hidden-state space to vocabulary space.
        # Raw scores are returned; CrossEntropyLoss applies log-softmax itself.
        self.linear = nn.Linear(hidden_size, vocab_size)
    def forward(self, features, captions):
        """Decode image feature vectors and generate captions."""
        # Remove the <end> token; the decoder never receives it as input.
        captions = captions[:, :-1]
        embeddings = self.embed(captions)
        # Prepend the image feature as the first input step of the sequence.
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.lstm(embeddings)
        out = self.linear(hiddens)  # (batch_size, caption_length, vocab_size)
        return out
    def sample(self, inputs, states=None, max_len=20):
        """Accept a pre-processed image tensor (inputs) and return a predicted
        sentence (list of tensor ids of length max_len), here via greedy search."""
        sampled_ids = []
        for _ in range(max_len):
            hiddens, states = self.lstm(inputs, states)  # (1, 1, hidden_size)
            scores = self.linear(hiddens.squeeze(1))     # (1, vocab_size)
            predicted = scores.argmax(dim=1)             # most likely next token
            sampled_ids.append(predicted.item())
            inputs = self.embed(predicted).unsqueeze(1)  # feed the prediction back in
        return sampled_ids
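A minimal greedy-inference sketch using the pieces defined above; image stands for one transformed image tensor, and idx2word is assumed to be the id-to-word mapping exposed by the project's vocabulary class:

image = image.unsqueeze(0).to(device)    # (1, 3, 224, 224)
features = encoder(image).unsqueeze(1)   # (1, 1, embed_size): one LSTM input step
word_ids = decoder.sample(features)      # list of up to max_len token ids
vocab = data_loader.dataset.vocab
print(' '.join(vocab.idx2word[i] for i in word_ids))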
/home/rosterloh/miniconda3/envs/cv-nd/bin/python /home/rosterloh/cv-nd/P2_Image_Captioning/analyse.py
[nltk_data] Downloading package punkt to /home/rosterloh/nltk_data...
[nltk_data] Package punkt is already up-to-date!
Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=0.54s)
creating index...
index created!
Obtaining caption lengths...
100%|██████████| 414113/414113 [00:38<00:00, 10650.50it/s]
Total number of tokens in vocabulary: 9956
sampled indices: [199955, 76317, 13471, 289884, 182100, 101662, 386165, 85410, 384008, 80888]
images.shape: torch.Size([10, 3, 224, 224])
captions.shape: torch.Size([10, 15])
Running on cuda
type(features): <class 'torch.Tensor'>
features.shape: torch.Size([10, 256])
isCUDA: True
Embeddings: torch.Size([10, 15, 256])
LSTM Output: torch.Size([10, 16, 512])
Linear Output: torch.Size([10, 16, 9956])
type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([10, 16, 9956])
Traceback (most recent call last):
File "/home/rosterloh/cv-nd/P2_Image_Captioning/analyse.py", line 112, in <module>
assert (outputs.shape[0]==batch_size) & (outputs.shape[1]==captions.shape[1]) & (outputs.shape[2]==vocab_size), "The shape of the decoder output is incorrect."
AssertionError: The shape of the decoder output is incorrect.
Process finished with exit code 1
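The assertion above failed because an earlier version of forward embedded every caption token: 15 embedded words plus the prepended image feature made 16 LSTM steps against captions of length 15. Trimming the <end> token with captions[:, :-1], as in the model.py listed above, restores the match; in the re-run below, 11 embedded words plus the image feature give 12 steps for captions of length 12.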
Embeddings: torch.Size([10, 11, 256])
LSTM Output: torch.Size([10, 12, 512])
Linear Output: torch.Size([10, 12, 9955])
type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([10, 12, 9955])