encoder_model = Encoder()
decoder_model = CaptionModel(vocab_size).to(device)
decoder_model.load_state_dict(torch.load(args.checkpoint))
for image_name in os.listdir("evaluate/images"):
    image = load_image(image_name, size=224)
    # convert the image pixels to a torch tensor
    image = transforms.ToTensor()(image)
    # add a batch dimension for the model
    image = image.unsqueeze(0)
    # prepare the image for the VGG model
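    # -- hedged continuation: the rest of this loop is not shown in the gist.
    # Standard torchvision ImageNet normalization for a VGG-style encoder,
    # then a feature pass and greedy caption generation. The 4096-d encoder
    # output and the tokenizer/max_length objects are assumptions here.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    image = normalize(image.squeeze(0)).unsqueeze(0).to(device)  # assumes encoder_model is also on `device`
    with torch.no_grad():
        features = encoder_model(image)
    caption = generate_desc(decoder_model, tokenizer, features, max_length)
    print(image_name, '->', caption)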
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate a description for this photo
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted token lists
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
    return actual, predicted
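The function above only collects the reference and candidate token lists; a hedged sketch of the scoring step using NLTK's corpus_bleu follows (the helper name and the four cumulative BLEU-n weights are assumptions, not taken from the gist):

from nltk.translate.bleu_score import corpus_bleu

def report_bleu(actual, predicted):
    # corpus-level BLEU with the usual cumulative n-gram weights
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))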
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    # seed the generation process
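    # -- hedged completion: the rest of this function is not shown in the gist.
    # A minimal greedy decode, assuming model(photo, sequence) returns logits
    # over the vocabulary and input sequences are left-padded to max_length.
    in_text = 'startseq'
    for _ in range(max_length):
        # encode the caption generated so far and pad it to a fixed length
        seq = tokenizer.texts_to_sequences([in_text])[0]
        seq = [0] * (max_length - len(seq)) + seq
        seq = torch.tensor(seq, dtype=torch.long).unsqueeze(0)  # move to the model's device if needed
        with torch.no_grad():
            yhat = model(photo, seq)
        # pick the most likely next word and map it back to text
        word = word_for_id(int(yhat.argmax(dim=-1)), tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        # stop once the end-of-sequence token is produced
        if word == 'endseq':
            break
    return in_text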
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = CaptionModel(vocab_size).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
if args.checkpoint is not None:
    print("Loading the checkpoint")
    model.load_state_dict(torch.load(args.checkpoint))
print("Number of epochs:", args.num_epochs)
class CaptionModel(nn.Module):
    def __init__(self, vocab_size):
        super(CaptionModel, self).__init__()
        # image-feature branch: input from the encoder (4096-d VGG features)
        self.feature_extractor = nn.Sequential()
        self.feature_extractor.add_module('dropout', nn.Dropout(0.5))
        self.feature_extractor.add_module('FC', nn.Linear(in_features=4096, out_features=256))
        self.feature_extractor.add_module('activation', nn.ReLU())
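        # -- hedged completion: the remaining layers are not shown in the gist.
        # Assumed sequence branch (embedding + LSTM) and decoder of the usual
        # merge architecture; the 256-unit sizes mirror the FC layer above.
        self.embedding = nn.Embedding(vocab_size, 256, padding_idx=0)
        self.emb_dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(256, 256, batch_first=True)
        self.decoder = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, vocab_size),
        )

    def forward(self, photo_features, sequence):
        # image branch: 4096-d VGG features -> 256-d vector
        img = self.feature_extractor(photo_features)
        # sequence branch: embed the partial caption and run it through the LSTM
        _, (hidden, _) = self.lstm(self.emb_dropout(self.embedding(sequence)))
        # merge the two 256-d representations and predict the next word
        merged = img + hidden.squeeze(0)
        return self.decoder(merged)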
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
    X1, X2, y = list(), list(), list()
    # walk through each image identifier
    for key, desc_list in descriptions.items():
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
X1 (photo)   X2 (text sequence)                             y (word)
photo        startseq                                       little
photo        startseq, little                               girl
photo        startseq, little, girl                         running
photo        startseq, little, girl, running                in
photo        startseq, little, girl, running, in            field
photo        startseq, little, girl, running, in, field     endseq
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        all_desc.extend(descriptions[key])
    return all_desc
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()  # Keras Tokenizer (its texts_to_sequences / word_index are used elsewhere in the gist)
    tokenizer.fit_on_texts(lines)
    return tokenizer
# load photo features
def load_photo_features(filename, dataset):
    # load all features (a pickled dict of image id -> feature vector)
    all_features = load(open(filename, 'rb'))
    # keep only the features for the given set of image identifiers
    features = {k: all_features[k] for k in dataset}
    return features
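Hypothetical usage of the helper above, assuming the extracted features were pickled to a file named features.pkl and that train_ids holds the image identifiers of the training split (neither name is taken from the gist):

from pickle import load  # load_photo_features relies on pickle's load

train_ids = {'img_0001', 'img_0002'}                              # hypothetical identifiers
train_features = load_photo_features('features.pkl', train_ids)   # hypothetical filename
print('Photos: train=%d' % len(train_features))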