@cheeyeo, forked from jhihn/tokenizer_serialization.py (created April 13, 2018 19:28)
Keras Tokenizer Gist
# Keras' Tokenizer lacks serialization, so the functions below add it without changing the API.
# (Since I don't know how long it will take for Keras to support it.)
# Ideally the Tokenizer __init__ would be modified to take the word_stats dictionary as a kwarg,
# and a method added to the class to return the stats; a sketch of that follows the imports.
# Experimentally this works, but I am not sure of all the nuances in the Tokenizer class.

import json

from keras.preprocessing.text import Tokenizer
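

# A minimal sketch of the API change suggested above (the class name and method are
# hypothetical, not part of Keras): __init__ accepts the exported word stats and
# rebuilds the vocabulary, and get_word_stats() returns them for saving. oov_token
# handling is omitted for brevity; see restore_tokenizer() below for the full logic.
class SerializableTokenizer(Tokenizer):
    def __init__(self, word_stats=None, **kwargs):
        super(SerializableTokenizer, self).__init__(**kwargs)
        if word_stats is not None:
            self.word_counts = word_stats['word_counts']
            self.word_docs = word_stats['word_docs']
            self.document_count = len(word_stats['word_docs'])
            # Rebuild word_index/index_docs the same way fit_on_texts() does.
            wcounts = sorted(self.word_counts.items(), key=lambda x: (x[1], x[0]), reverse=True)
            self.word_index = dict((w, i + 1) for i, (w, _) in enumerate(wcounts))
            self.index_docs = dict((self.word_index[w], c) for w, c in self.word_docs.items())

    def get_word_stats(self):
        return {'word_counts': dict(self.word_counts), 'word_docs': dict(self.word_docs)}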

def test_tokenizer():
    texts = ["It was the best of times, it was the worst of times, it was the age of wisdom",
             "it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, ",
             "it was the season of Light, it was the season of Darkness, it was the spring of hope, ",
             "it was the winter of despair, we had nothing before us, we were all going direct to Heaven, ",
             "we were all going direct the other way"]
    tokenizer = Tokenizer()
    # tokenizer.word_counts = dict()
    tokenizer.fit_on_texts(texts)
    print('tokenizer.word_index', tokenizer.word_index)
    print('tokenizer.word_counts', tokenizer.word_counts)
    hack_tokenizer(tokenizer)
    encoded_docs1 = tokenizer.texts_to_sequences(texts)

    # Round-trip the word stats through JSON to show they serialize cleanly.
    word_stats1 = export_tokenizer(tokenizer)
    word_stats2 = json.loads(json.dumps(word_stats1, sort_keys=True))  # key order can be sorted or unsorted
    # print('word_stats1', word_stats1)
    # print('word_stats2', word_stats2)
    tokenizer2 = restore_tokenizer(word_stats2)
    encoded_docs2 = tokenizer2.texts_to_sequences(texts)

    print('encoded_docs1')
    for doc in encoded_docs1:
        print('\t', doc)
    print('encoded_docs2')
    for doc in encoded_docs2:
        print('\t', doc)
    # The restored tokenizer must encode the texts identically to the original one.
    print(encoded_docs1 == encoded_docs2)

def export_tokenizer(tokenizer):
    # Return the fitted word statistics as plain dicts so they JSON-serialize cleanly.
    return {'word_counts': dict(tokenizer.word_counts),
            'word_docs': dict(tokenizer.word_docs)}

def hack_tokenizer(tokenizer):
    # Show that an OrderedDict is not needed, provided x[0] is included in the sort key.
    tokenizer.word_counts = dict(tokenizer.word_counts)  # bye, OrderedDict!
    wcounts = list(tokenizer.word_counts.items())
    wcounts.sort(key=lambda x: (x[1], x[0]), reverse=True)
    sorted_voc = [wc[0] for wc in wcounts]
    # note that index 0 is reserved, never assigned to an existing word
    tokenizer.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))
    print('tokenizer.word_index', tokenizer.word_index)
    if tokenizer.oov_token is not None:
        i = tokenizer.word_index.get(tokenizer.oov_token)
        if i is None:
            tokenizer.word_index[tokenizer.oov_token] = len(tokenizer.word_index) + 1
    tokenizer.index_docs = {}
    for w, c in list(tokenizer.word_docs.items()):
        tokenizer.index_docs[tokenizer.word_index[w]] = c

def restore_tokenizer(word_stats):
    tokenizer = Tokenizer()
    tokenizer.word_counts = word_stats['word_counts']
    tokenizer.word_docs = word_stats['word_docs']
    # NOTE: the true document count is not stored in word_stats, so the vocabulary
    # size is used as an approximation; texts_to_sequences() does not depend on it.
    tokenizer.document_count = len(word_stats['word_docs'])
    # from here on, this is taken from the original fit_on_texts(), except as noted
    wcounts = list(tokenizer.word_counts.items())
    wcounts.sort(key=lambda x: (x[1], x[0]), reverse=True)  # except for including x[0]
    sorted_voc = [wc[0] for wc in wcounts]
    # note that index 0 is reserved, never assigned to an existing word
    tokenizer.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))  # and except superfluous list()s removed
    if tokenizer.oov_token is not None:
        i = tokenizer.word_index.get(tokenizer.oov_token)
        if i is None:
            tokenizer.word_index[tokenizer.oov_token] = len(tokenizer.word_index) + 1
    tokenizer.index_docs = {}
    for w, c in list(tokenizer.word_docs.items()):
        tokenizer.index_docs[tokenizer.word_index[w]] = c
    return tokenizer
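

# Example usage (a sketch; 'tokenizer_stats.json' and the helper names below are
# arbitrary, not part of the original gist): persist the fitted stats to disk and
# rebuild an equivalent tokenizer later.
def save_tokenizer(tokenizer, path='tokenizer_stats.json'):
    with open(path, 'w') as f:
        json.dump(export_tokenizer(tokenizer), f, sort_keys=True)


def load_tokenizer(path='tokenizer_stats.json'):
    with open(path) as f:
        return restore_tokenizer(json.load(f))


if __name__ == '__main__':
    test_tokenizer()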