Skip to content

Instantly share code, notes, and snippets.

@akash-ch2812
Created July 23, 2020 15:38
Show Gist options
  • Save akash-ch2812/9d607ee2bf1c0d0a842ee893328194c7 to your computer and use it in GitHub Desktop.
Save akash-ch2812/9d607ee2bf1c0d0a842ee893328194c7 to your computer and use it in GitHub Desktop.
# loading captions from captions file
import pandas as pd
# Load the Flickr8k captions file (one row per image/caption pair).
captions = pd.read_csv('/kaggle/input/flickr8k/captions.txt', sep=",")
# Normalise the header names so the column lookups below do not depend on
# stray whitespace or capitalisation in the CSV header.
captions = captions.rename(columns=lambda x: x.strip().lower())
# Keep only the two columns used below. The explicit copy makes this an
# independent frame, so the column assignments that follow write to it
# directly (avoids pandas' SettingWithCopyWarning).
captions = captions[['image', 'caption']].copy()
# Reduce every file name to the text before its first "." (the image id).
# The .str accessor leaves missing names as NaN instead of raising, so the
# dropna() below can remove those rows.
captions['image'] = captions['image'].str.split(".", n=1).str[0]
# Shape before cleaning, so the number of dropped rows is visible.
print(captions.shape)
# Drop rows with a missing value, then rows whose caption is empty or only
# whitespace: dropna() alone does not catch blank strings.
captions = captions.dropna()
captions = captions[captions['caption'].str.strip() != ""].copy()
print(captions.shape)
# Wrap every remaining caption in the <start> / <end> marker tokens.
captions['caption'] = "<start> " + captions['caption'] + " <end>"
# Split the captions into training and testing dictionaries keyed by image id.
# Group the captions by image id in a single pass; filtering the whole frame
# inside the loops would rescan every row once per image. Row order is kept,
# so each image's captions appear in the same order as in `captions`.
captions_by_image = {}
for image_id, caption in zip(captions['image'], captions['caption']):
    captions_by_image.setdefault(image_id, []).append(caption)

train_image_captions = {}
test_image_captions = {}
# One entry per image: the list of that image's captions.
# NOTE(review): append() keeps the per-image nesting (a list of lists); if the
# code that consumes all_captions expects a flat list of strings, this should
# be extend() instead -- confirm against the caller.
all_captions = []

# Training images. An id with no caption rows maps to an empty list.
for image in train_data_images:
    # Copy so every dictionary entry owns its own list object.
    list_of_captions = list(captions_by_image.get(image, []))
    train_image_captions[image] = list_of_captions
    all_captions.append(list_of_captions)

# Testing images, handled the same way.
for image in test_data_images:
    list_of_captions = list(captions_by_image.get(image, []))
    test_image_captions[image] = list_of_captions
    all_captions.append(list_of_captions)

print("Data Statistics")
print(f"Training Images Captions {len(train_image_captions)}")
print(f"Testing Images Captions {len(test_image_captions)}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment