@cheeyeo, forked from jhihn/tokenizer_serialization.py (created April 13, 2018 19:28)
Keras Tokenizer Gist
# Keras' Tokenizer lacks serialization, so the functions below add it without changing the API.
# (Since I don't know how long it will take for Keras to support it.)
# Ideally the Tokenizer __init__ would be modified to take the word_stats dictionary as a kwarg,
# and a method added to the class to return the stats; a sketch of that follows the imports.
# Experimentally this works, but I am not sure of all the nuances in the Tokenizer class.

import json

from keras.preprocessing.text import Tokenizer
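

# A minimal sketch of the API change suggested above (the class name and method are
# hypothetical, not part of Keras): __init__ accepts the exported word stats and
# rebuilds the vocabulary, and get_word_stats() returns them for saving. oov_token
# handling is omitted for brevity; see restore_tokenizer() below for the full logic.
class SerializableTokenizer(Tokenizer):
    def __init__(self, word_stats=None, **kwargs):
        super(SerializableTokenizer, self).__init__(**kwargs)
        if word_stats is not None:
            self.word_counts = word_stats['word_counts']
            self.word_docs = word_stats['word_docs']
            self.document_count = len(word_stats['word_docs'])
            # Rebuild word_index/index_docs the same way fit_on_texts() does.
            wcounts = sorted(self.word_counts.items(), key=lambda x: (x[1], x[0]), reverse=True)
            self.word_index = dict((w, i + 1) for i, (w, _) in enumerate(wcounts))
            self.index_docs = dict((self.word_index[w], c) for w, c in self.word_docs.items())

    def get_word_stats(self):
        return {'word_counts': dict(self.word_counts), 'word_docs': dict(self.word_docs)}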

def test_tokenizer():
    texts = ["It was the best of times, it was the worst of times, it was the age of wisdom",
             "it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, ",
             "it was the season of Light, it was the season of Darkness, it was the spring of hope, ",
             "it was the winter of despair, we had nothing before us, we were all going direct to Heaven, ",
             "we were all going direct the other way"]
    tokenizer = Tokenizer()
    # tokenizer.word_counts = dict()
    tokenizer.fit_on_texts(texts)
    print('tokenizer.word_index', tokenizer.word_index)
    print('tokenizer.word_counts', tokenizer.word_counts)
    hack_tokenizer(tokenizer)
    encoded_docs1 = tokenizer.texts_to_sequences(texts)

    # Round-trip the word stats through JSON to show they serialize cleanly.
    word_stats1 = export_tokenizer(tokenizer)
    word_stats2 = json.loads(json.dumps(word_stats1, sort_keys=True))  # key order can be sorted or unsorted
    # print('word_stats1', word_stats1)
    # print('word_stats2', word_stats2)
    tokenizer2 = restore_tokenizer(word_stats2)
    encoded_docs2 = tokenizer2.texts_to_sequences(texts)

    print('encoded_docs1')
    for doc in encoded_docs1:
        print('\t', doc)
    print('encoded_docs2')
    for doc in encoded_docs2:
        print('\t', doc)
    # The restored tokenizer must encode the texts identically to the original one.
    print(encoded_docs1 == encoded_docs2)

def export_tokenizer(tokenizer):
    # Return the fitted word statistics as plain dicts so they JSON-serialize cleanly.
    return {'word_counts': dict(tokenizer.word_counts),
            'word_docs': dict(tokenizer.word_docs)}

def hack_tokenizer(tokenizer):
    # Show that an OrderedDict is not needed, provided x[0] is included in the sort key.
    tokenizer.word_counts = dict(tokenizer.word_counts)  # bye, OrderedDict!
    wcounts = list(tokenizer.word_counts.items())
    wcounts.sort(key=lambda x: (x[1], x[0]), reverse=True)
    sorted_voc = [wc[0] for wc in wcounts]
    # note that index 0 is reserved, never assigned to an existing word
    tokenizer.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))
    print('tokenizer.word_index', tokenizer.word_index)
    if tokenizer.oov_token is not None:
        i = tokenizer.word_index.get(tokenizer.oov_token)
        if i is None:
            tokenizer.word_index[tokenizer.oov_token] = len(tokenizer.word_index) + 1
    tokenizer.index_docs = {}
    for w, c in list(tokenizer.word_docs.items()):
        tokenizer.index_docs[tokenizer.word_index[w]] = c

def restore_tokenizer(word_stats):
    tokenizer = Tokenizer()
    tokenizer.word_counts = word_stats['word_counts']
    tokenizer.word_docs = word_stats['word_docs']
    # NOTE: the true document count is not stored in word_stats, so the vocabulary
    # size is used as an approximation; texts_to_sequences() does not depend on it.
    tokenizer.document_count = len(word_stats['word_docs'])
    # from here on, this is taken from the original fit_on_texts(), except as noted
    wcounts = list(tokenizer.word_counts.items())
    wcounts.sort(key=lambda x: (x[1], x[0]), reverse=True)  # except for including x[0]
    sorted_voc = [wc[0] for wc in wcounts]
    # note that index 0 is reserved, never assigned to an existing word
    tokenizer.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))  # and except superfluous list()s removed
    if tokenizer.oov_token is not None:
        i = tokenizer.word_index.get(tokenizer.oov_token)
        if i is None:
            tokenizer.word_index[tokenizer.oov_token] = len(tokenizer.word_index) + 1
    tokenizer.index_docs = {}
    for w, c in list(tokenizer.word_docs.items()):
        tokenizer.index_docs[tokenizer.word_index[w]] = c
    return tokenizer
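

# Example usage (a sketch; 'tokenizer_stats.json' and the helper names below are
# arbitrary, not part of the original gist): persist the fitted stats to disk and
# rebuild an equivalent tokenizer later.
def save_tokenizer(tokenizer, path='tokenizer_stats.json'):
    with open(path, 'w') as f:
        json.dump(export_tokenizer(tokenizer), f, sort_keys=True)


def load_tokenizer(path='tokenizer_stats.json'):
    with open(path) as f:
        return restore_tokenizer(json.load(f))


if __name__ == '__main__':
    test_tokenizer()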