import spacy
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

# tokenize every caption, remove punctuation, lowercase everything
for key, value in train_image_captions.items():
    ls = []
    for v in value:
        doc = nlp(v)
        new_v = " "
        for token in doc:
            if not token.is_punct:
                if token.text not in [" ", "\n", "\n\n"]:
                    new_v = new_v + " " + token.text.lower()
        new_v = new_v.strip()
        ls.append(new_v)
    train_image_captions[key] = ls
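
# Sanity check (hypothetical example, not from the original gist; keys and captions
# depend on your dataset -- the cleaning simply drops punctuation and lowercases):
# before:  train_image_captions["some_image_id"] == ["A child in a pink dress is climbing up a set of stairs."]
# after:   train_image_captions["some_image_id"] == ["a child in a pink dress is climbing up a set of stairs"]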

# create a vocabulary of all the unique words present in captions
# assumption: all_captions is built from the cleaned caption lists above
all_captions = list(train_image_captions.values())
# flatten the list of caption lists into a single list of captions
all_captions = [caption for list_of_captions in all_captions for caption in list_of_captions]

# use spacy to convert to lowercase and reject any special characters
tokens = []
for captions in all_captions:
    doc = nlp(captions)
    for token in doc:
        if not token.is_punct:
            if token.text not in [" ", "\n", "\n\n"]:
                tokens.append(token.text.lower())

# get tokens with frequency less than 10
import collections

word_count_dict = collections.Counter(tokens)
reject_words = []
for key, value in word_count_dict.items():
    if value < 10:
        reject_words.append(key)
reject_words.append("<")
reject_words.append(">")

# remove tokens that are in reject words (set lookup keeps the filter fast)
reject_words = set(reject_words)
tokens = [x for x in tokens if x not in reject_words]
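
# Optional sanity check (not in the original gist): see how much the frequency cut shrinks the vocabulary
print("unique tokens kept:", len(set(tokens)))
print("tokens rejected:", len(reject_words))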

# convert the tokens to their integer indices using the Tokenizer class of Keras
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
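
# Minimal usage sketch (assumption: the cleaned captions are later encoded as index sequences
# for the captioning model; the example caption below is hypothetical).
# fit_on_texts builds tokenizer.word_index, a {word: index} dict whose indices start at 1,
# so an embedding layer typically needs a vocabulary size of len(word_index) + 1.
vocab_size = len(tokenizer.word_index) + 1

# texts_to_sequences maps each string to a list of integer indices
example = tokenizer.texts_to_sequences(["a child in a pink dress is climbing up a set of stairs"])
print(vocab_size, example)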