# cleaning captions
import os
import json
import random

import numpy as np
from keras.preprocessing.text import Tokenizer

train_path = 'training_data'
TRAIN_LABEL_PATH = os.path.join(train_path, 'training_label.json')
# fraction of the data used for training; the rest becomes the validation set
train_split = 0.85
# loading the JSON file with the training captions
with open(TRAIN_LABEL_PATH) as data_file:
    y_data = json.load(data_file)
# train_list holds [caption, video ID] pairs
# vocab_list holds the captions used to build the vocabulary
train_list = []
vocab_list = []
for y in y_data:
    for caption in y['caption']:
        caption = "<bos> " + caption + " <eos>"
        token_count = len(caption.split())
        # keep only captions of 6 to 10 tokens, counting the <bos> and <eos> markers
        if token_count > 10 or token_count < 6:
            continue
        train_list.append([caption, y['id']])
print(len(train_list))
random.shuffle(train_list)
training_list = train_list[:int(len(train_list)*train_split)]
validation_list = train_list[int(len(train_list)*train_split):]
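# Quick peek (illustrative, not part of the original gist): each entry is a
# [caption, video ID] pair with the caption wrapped in <bos>/<eos> markers.
print(training_list[0])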
for train in training_list:
    vocab_list.append(train[0])
# Tokenizing the words; num_words keeps only the 1500 most frequent words
# when captions are later converted to integer sequences
tokenizer = Tokenizer(num_words=1500)
tokenizer.fit_on_texts(vocab_list)
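# A minimal sketch (not part of the original gist): the fitted tokenizer can
# now turn captions into padded integer sequences for a model. The use of
# pad_sequences and a maxlen of 10 are assumptions that mirror the
# caption-length filter above.
from keras.preprocessing.sequence import pad_sequences

train_sequences = tokenizer.texts_to_sequences([caption for caption, _ in training_list])
train_sequences = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=10)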
x_data = {}
TRAIN_FEATURE_DIR = os.path.join('training_data', 'feat')
# Loading all the numpy feature arrays at once into a dictionary,
# keyed by video ID (the filename with its .npy extension stripped)
for filename in os.listdir(TRAIN_FEATURE_DIR):
    f = np.load(os.path.join(TRAIN_FEATURE_DIR, filename))
    x_data[filename[:-4]] = f
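# Illustrative sanity check (an assumption, not in the original gist): every
# video ID referenced by a training caption should have a feature array in x_data.
missing = {vid for _, vid in training_list if vid not in x_data}
print(f"video IDs without features: {len(missing)}")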