Created
July 23, 2020 15:38
-
-
Save akash-ch2812/9d607ee2bf1c0d0a842ee893328194c7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# loading captions from captions file
import pandas as pd

# loading captions.txt (comma-separated, first row is the header)
captions = pd.read_csv('/kaggle/input/flickr8k/captions.txt', sep=",")

# normalise the header names so the 'image' / 'caption' lookups below do not
# depend on stray whitespace or capitalisation in the file's header row
captions = captions.rename(columns=lambda x: x.strip().lower())

# keep only the part of the file name before the first "."
captions['image'] = captions['image'].apply(lambda x: x.split(".")[0])

# .copy() so the column assignments below act on an independent frame
captions = captions[['image', 'caption']].copy()

# turn blank (empty / whitespace-only) captions into NaN so that the dropna()
# below removes them together with the genuinely missing ones; without this a
# blank caption would survive as "<start>  <end>"
captions['caption'] = captions['caption'].where(
    captions['caption'].str.strip() != ""
)

# adding <start> and <end> to every caption
# (a NaN caption stays NaN here, so it is still caught by dropna())
captions['caption'] = "<start> " + captions['caption'] + " <end>"

# in case we have any missing caption/blank caption drop it
# the shape is printed before and after so the number of dropped rows is visible
print(captions.shape)
captions = captions.dropna()
print(captions.shape)
# training and testing image captions split
# each dict maps an image id to the list of its captions
train_image_captions = {}
test_image_captions = {}

# list for storing every caption
# NOTE: one entry is appended per image, so this is a list of per-image
# caption lists (not a flat list of strings)
all_captions = []

# index the captions by image id in a single pass over the table, so every
# image below is a dict lookup instead of a full scan of the DataFrame
# (assumes the image ids are hashable values such as strings)
captions_by_image = {}
for image_id, caption in zip(captions['image'], captions['caption']):
    captions_by_image.setdefault(image_id, []).append(caption)

# storing training data, then testing data
for image_ids, split_captions in (
    (train_data_images, train_image_captions),
    (test_data_images, test_image_captions),
):
    for image in image_ids:
        # copy the list so every entry owns its captions; an image without
        # any caption row gets an empty list
        list_of_captions = list(captions_by_image.get(image, []))
        split_captions[image] = list_of_captions
        all_captions.append(list_of_captions)

print("Data Statistics")
print(f"Training Images Captions {len(train_image_captions)}")
print(f"Testing Images Captions {len(test_image_captions)}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment