import spacy
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

# tokenize every caption, remove punctuation, lowercase everything
for key, value in train_image_captions.items():
    ls = []
    for v in value:
        doc = nlp(v)
        new_v = " "
        for token in doc:
            if not token.is_punct:
                if token.text not in [" ", "\n", "\n\n"]:
                    new_v = new_v + " " + token.text.lower()
        new_v = new_v.strip()
        ls.append(new_v)
    train_image_captions[key] = ls
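
# Sanity check (hypothetical example, not from the original gist; keys and captions
# depend on your dataset -- the cleaning simply drops punctuation and lowercases):
# before:  train_image_captions["some_image_id"] == ["A child in a pink dress is climbing up a set of stairs."]
# after:   train_image_captions["some_image_id"] == ["a child in a pink dress is climbing up a set of stairs"]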

# create a vocabulary of all the unique words present in captions
# assumption: all_captions is built from the cleaned caption lists above
all_captions = list(train_image_captions.values())
# flatten the list of caption lists into a single list of captions
all_captions = [caption for list_of_captions in all_captions for caption in list_of_captions]

# use spacy to convert to lowercase and reject any special characters
tokens = []
for captions in all_captions:
    doc = nlp(captions)
    for token in doc:
        if not token.is_punct:
            if token.text not in [" ", "\n", "\n\n"]:
                tokens.append(token.text.lower())

# get tokens with frequency less than 10
import collections

word_count_dict = collections.Counter(tokens)
reject_words = []
for key, value in word_count_dict.items():
    if value < 10:
        reject_words.append(key)
reject_words.append("<")
reject_words.append(">")

# remove tokens that are in reject words (set lookup keeps the filter fast)
reject_words = set(reject_words)
tokens = [x for x in tokens if x not in reject_words]
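
# Optional sanity check (not in the original gist): see how much the frequency cut shrinks the vocabulary
print("unique tokens kept:", len(set(tokens)))
print("tokens rejected:", len(reject_words))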

# convert the tokens to their integer indices using the Tokenizer class of Keras
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
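
# Minimal usage sketch (assumption: the cleaned captions are later encoded as index sequences
# for the captioning model; the example caption below is hypothetical).
# fit_on_texts builds tokenizer.word_index, a {word: index} dict whose indices start at 1,
# so an embedding layer typically needs a vocabulary size of len(word_index) + 1.
vocab_size = len(tokenizer.word_index) + 1

# texts_to_sequences maps each string to a list of integer indices
example = tokenizer.texts_to_sequences(["a child in a pink dress is climbing up a set of stairs"])
print(vocab_size, example)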