encoder_model = Encoder()
decoder_model = CaptionModel(vocab_size).to(device)
decoder_model.load_state_dict(torch.load(args.checkpoint))
for image_name in os.listdir("evaluate/images"):
    image = load_image(image_name, size=224)
    # convert the image pixels to a torch tensor
    image = transforms.ToTensor()(image)
    # add a batch dimension for the model
    image = image.unsqueeze(0)
    # prepare the image for the VGG model
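    # -- hedged continuation: the rest of this loop is not shown in the gist.
    # Standard torchvision ImageNet normalization for a VGG-style encoder,
    # then a feature pass and greedy caption generation. The 4096-d encoder
    # output and the tokenizer/max_length objects are assumptions here.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    image = normalize(image.squeeze(0)).unsqueeze(0).to(device)  # assumes encoder_model is also on `device`
    with torch.no_grad():
        features = encoder_model(image)
    caption = generate_desc(decoder_model, tokenizer, features, max_length)
    print(image_name, '->', caption)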
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate a description for this photo
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted token lists
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
    return actual, predicted
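The function above only collects the reference and candidate token lists; a hedged sketch of the scoring step using NLTK's corpus_bleu follows (the helper name and the four cumulative BLEU-n weights are assumptions, not taken from the gist):

from nltk.translate.bleu_score import corpus_bleu

def report_bleu(actual, predicted):
    # corpus-level BLEU with the usual cumulative n-gram weights
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))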
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    # seed the generation process
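    # -- hedged completion: the rest of this function is not shown in the gist.
    # A minimal greedy decode, assuming model(photo, sequence) returns logits
    # over the vocabulary and input sequences are left-padded to max_length.
    in_text = 'startseq'
    for _ in range(max_length):
        # encode the caption generated so far and pad it to a fixed length
        seq = tokenizer.texts_to_sequences([in_text])[0]
        seq = [0] * (max_length - len(seq)) + seq
        seq = torch.tensor(seq, dtype=torch.long).unsqueeze(0)  # move to the model's device if needed
        with torch.no_grad():
            yhat = model(photo, seq)
        # pick the most likely next word and map it back to text
        word = word_for_id(int(yhat.argmax(dim=-1)), tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        # stop once the end-of-sequence token is produced
        if word == 'endseq':
            break
    return in_text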
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = CaptionModel(vocab_size).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
if args.checkpoint is not None:
    print("Loading the checkpoint")
    model.load_state_dict(torch.load(args.checkpoint))
print("Number of epochs:", args.num_epochs)
class CaptionModel(nn.Module):
    def __init__(self, vocab_size):
        super(CaptionModel, self).__init__()
        # image-feature branch: input from the encoder (4096-d VGG features)
        self.feature_extractor = nn.Sequential()
        self.feature_extractor.add_module('dropout', nn.Dropout(0.5))
        self.feature_extractor.add_module('FC', nn.Linear(in_features=4096, out_features=256))
        self.feature_extractor.add_module('activation', nn.ReLU())
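        # -- hedged completion: the remaining layers are not shown in the gist.
        # Assumed sequence branch (embedding + LSTM) and decoder of the usual
        # merge architecture; the 256-unit sizes mirror the FC layer above.
        self.embedding = nn.Embedding(vocab_size, 256, padding_idx=0)
        self.emb_dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(256, 256, batch_first=True)
        self.decoder = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, vocab_size),
        )

    def forward(self, photo_features, sequence):
        # image branch: 4096-d VGG features -> 256-d vector
        img = self.feature_extractor(photo_features)
        # sequence branch: embed the partial caption and run it through the LSTM
        _, (hidden, _) = self.lstm(self.emb_dropout(self.embedding(sequence)))
        # merge the two 256-d representations and predict the next word
        merged = img + hidden.squeeze(0)
        return self.decoder(merged)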
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
    X1, X2, y = list(), list(), list()
    # walk through each image identifier
    for key, desc_list in descriptions.items():
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
X1 (photo)   X2 (text sequence)                             y (word)
photo        startseq                                       little
photo        startseq, little                               girl
photo        startseq, little, girl                         running
photo        startseq, little, girl, running                in
photo        startseq, little, girl, running, in            field
photo        startseq, little, girl, running, in, field     endseq
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        all_desc.extend(descriptions[key])
    return all_desc
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()  # Keras Tokenizer (its texts_to_sequences / word_index are used elsewhere in the gist)
    tokenizer.fit_on_texts(lines)
    return tokenizer
# load photo features
def load_photo_features(filename, dataset):
    # load all features (a pickled dict of image id -> feature vector)
    all_features = load(open(filename, 'rb'))
    # keep only the features for the given set of image identifiers
    features = {k: all_features[k] for k in dataset}
    return features
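Hypothetical usage of the helper above, assuming the extracted features were pickled to a file named features.pkl and that train_ids holds the image identifiers of the training split (neither name is taken from the gist):

from pickle import load  # load_photo_features relies on pickle's load

train_ids = {'img_0001', 'img_0002'}                              # hypothetical identifiers
train_features = load_photo_features('features.pkl', train_ids)   # hypothetical filename
print('Photos: train=%d' % len(train_features))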