# cleaning captions
import os
import json
import random

import numpy as np
from keras.preprocessing.text import Tokenizer

train_path = 'training_data'
TRAIN_LABEL_PATH = os.path.join(train_path, 'training_label.json')
# fraction of the data used for training; the rest becomes the validation set
train_split = 0.85
# loading the JSON file with the training captions
with open(TRAIN_LABEL_PATH) as data_file:
    y_data = json.load(data_file)
# train_list holds [caption, video ID] pairs
# vocab_list holds the captions used to build the vocabulary
train_list = []
vocab_list = []
for y in y_data:
    for caption in y['caption']:
        caption = "<bos> " + caption + " <eos>"
        token_count = len(caption.split())
        # keep only captions of 6 to 10 tokens, counting the <bos> and <eos> markers
        if token_count > 10 or token_count < 6:
            continue
        train_list.append([caption, y['id']])
print(len(train_list))
random.shuffle(train_list)
training_list = train_list[:int(len(train_list)*train_split)]
validation_list = train_list[int(len(train_list)*train_split):]
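# Quick peek (illustrative, not part of the original gist): each entry is a
# [caption, video ID] pair with the caption wrapped in <bos>/<eos> markers.
print(training_list[0])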
for train in training_list:
    vocab_list.append(train[0])
# Tokenizing the words; num_words keeps only the 1500 most frequent words
# when captions are later converted to integer sequences
tokenizer = Tokenizer(num_words=1500)
tokenizer.fit_on_texts(vocab_list)
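# A minimal sketch (not part of the original gist): the fitted tokenizer can
# now turn captions into padded integer sequences for a model. The use of
# pad_sequences and a maxlen of 10 are assumptions that mirror the
# caption-length filter above.
from keras.preprocessing.sequence import pad_sequences

train_sequences = tokenizer.texts_to_sequences([caption for caption, _ in training_list])
train_sequences = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=10)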
x_data = {}
TRAIN_FEATURE_DIR = os.path.join('training_data', 'feat')
# Loading all the numpy feature arrays at once into a dictionary,
# keyed by video ID (the filename with its .npy extension stripped)
for filename in os.listdir(TRAIN_FEATURE_DIR):
    f = np.load(os.path.join(TRAIN_FEATURE_DIR, filename))
    x_data[filename[:-4]] = f
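# Illustrative sanity check (an assumption, not in the original gist): every
# video ID referenced by a training caption should have a feature array in x_data.
missing = {vid for _, vid in training_list if vid not in x_data}
print(f"video IDs without features: {len(missing)}")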