import spacy

nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

# tokenize every caption, remove punctuation, and lowercase everything
for key, value in train_image_captions.items():
    ls = []
    for v in value:
        doc = nlp(v)
        new_v = ""
        for token in doc:
            if not token.is_punct and token.text not in [" ", "\n", "\n\n"]:
                new_v = new_v + " " + token.text.lower()
        ls.append(new_v.strip())
    train_image_captions[key] = ls
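
# (sketch) optional sanity check: print the cleaned captions for one image.
# `sample_key` is hypothetical -- any image id present in your dataset works.
sample_key = list(train_image_captions.keys())[0]
print(train_image_captions[sample_key])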
# create a vocabulary of all the unique words present in the captions;
# `all_captions` is assumed here to be the list of caption lists taken
# from the cleaned train_image_captions, e.g.:
all_captions = list(train_image_captions.values())

# flatten the list of lists into a single list of captions
all_captions = [caption for list_of_captions in all_captions for caption in list_of_captions]

# use spaCy to lowercase each token and reject punctuation and whitespace
tokens = []
for caption in all_captions:
    doc = nlp(caption)
    for token in doc:
        if not token.is_punct and token.text not in [" ", "\n", "\n\n"]:
            tokens.append(token.text.lower())
# collect tokens with frequency less than 10
import collections

word_count_dict = collections.Counter(tokens)
reject_words = [key for key, value in word_count_dict.items() if value < 10]
# also reject the "<" and ">" tokens
reject_words.append("<")
reject_words.append(">")

# remove tokens that are in reject_words (a set makes the membership test fast)
reject_words = set(reject_words)
tokens = [x for x in tokens if x not in reject_words]
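
# (sketch) optional check: see how large the vocabulary is after the
# frequency filter above has removed the rare tokens
print("unique tokens kept:", len(set(tokens)))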
# map each token to an equivalent integer index using Keras's Tokenizer class
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
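
# (sketch) after fitting, the tokenizer exposes a word -> index mapping and can
# encode new text; the vocabulary size is commonly needed later for an embedding
# layer. The example caption below is hypothetical.
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
print("vocabulary size:", vocab_size)
print(tokenizer.texts_to_sequences(["a dog runs on the grass"]))  # words outside the vocabulary are dropped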