Paddy3118/flashtext_regex_dict_timing.py

## flashtext_regex_dict_timing.py
"""

Original: https://gist.github.com/vi3k6i5/dc3335ee46ab9f650b19885e8ade6c7a

Additional dict.get() word replacer added

Created on Wed Dec 20 14:03:51 2017

@author: Paddy3118
"""

#!/bin/python
from flashtext.keyword import KeywordProcessor
import random
import string
import re
import time
import gc


def get_word_of_length(str_length):
    """Return a random word made of ``str_length`` lowercase ASCII letters.

    Each letter is drawn independently with ``random.choice``; a length of
    zero (or less) yields the empty string.
    """
    alphabet = string.ascii_lowercase
    letters = [random.choice(alphabet) for _ in range(str_length)]
    return ''.join(letters)

# Generate a pool of 100K random words, each 3 to 8 letters long.
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(100000)]

print('Count  | FlashText | Regex    | dict.get() | Comments')
print('------------------------------------------------------')
# One benchmark row per requested keyword count: 1, 2501, 5001, ..., 20001.
for keywords_length in range(1, 20002, 2500):
    gc.collect()
    # Choose 5000*10 terms and create a string to search in.
    all_words_chosen = random.sample(all_words, 5000*10)
    story = ' '.join(all_words_chosen)

    # Get unique keywords from the list of words generated. set() drops
    # duplicates, so there may be fewer than keywords_length keywords.
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # Compile regex.
    # source: https://stackoverflow.com/questions/6116978/python-replace-multiple-strings
    # Every keyword maps to the same replacement text.
    rep = dict.fromkeys(unique_keywords_sublist, '_keyword_')
    # Escape each keyword once, while building the pattern, so the timed
    # substitution below can look the raw match up in ``rep`` directly.
    # NOTE: the alternation has no word-boundary anchors, so it also replaces
    # keywords that occur inside longer words, whereas the dict.get() variant
    # only replaces whole space-separated words. The Comments column reports
    # whenever the outputs differ.
    compiled_re = re.compile("|".join(map(re.escape, rep)))

    # Add keywords to flashtext.
    keyword_processor = KeywordProcessor()
    for keyword in unique_keywords_sublist:
        keyword_processor.add_keyword(keyword, '_keyword_')

    # Keep automatic garbage collection out of the timed sections.
    gc.disable()
    # Time the modules. perf_counter() is a monotonic, high-resolution clock
    # meant for measuring short durations; time.time() is a wall clock whose
    # resolution can be too coarse for the fast dict.get() pass.
    start = time.perf_counter()
    # flashtext (but omitting its keyword setup)
    _1 = keyword_processor.replace_keywords(story)
    mid = time.perf_counter()
    # re (omitting its regexp compilation)
    _2 = compiled_re.sub(lambda m: rep[m.group(0)], story)
    end = time.perf_counter()
    # dict.get(word, word) returns the original word if it is not in the dict
    _3 = ' '.join(rep.get(word, word) for word in story.split())
    end3 = time.perf_counter()

    gc.enable()
    # Print the timings; the row is completed by the comparison summary below.
    print(str(keywords_length).ljust(6), '|',
          "{0:.5f}".format(mid - start).ljust(9), '|',
          "{0:.5f}".format(end - mid).ljust(9), '|',
          "{0:.5f}".format(end3 - end).ljust(9), '|',
          end=' ')
    # Report, for each pair of methods, whether their outputs are identical.
    comment = []
    for (name_a, out_a), (name_b, out_b) in (
            (('#1', _1), ('#2', _2)),
            (('#1', _1), ('#3', _3)),
            (('#2', _2), ('#3', _3))):
        relation = '!=' if out_a != out_b else '=='
        comment.append('{0} {1} {2}'.format(name_a, relation, name_b))
    print(' and '.join(comment))

# Sample output
#Count  | FlashText | Regex    | dict.get() | Comments
#------------------------------------------------------
#1      | 0.09375   | 0.00000   | 0.00000   | #1 != #2 and #1 == #3 and #2 != #3
#2501   | 0.10938   | 3.67579   | 0.00000   | #1 != #2 and #1 == #3 and #2 != #3
#5001   | 0.10939   | 6.98075   | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3
#7501   | 0.09745   | 9.90468   | 0.01563   | #1 != #2 and #1 != #3 and #2 != #3
#10001  | 0.10937   | 12.43827  | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3
#12501  | 0.12498   | 14.58402  | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3
#15001  | 0.12500   | 16.56654  | 0.01562   | #1 != #2 and #1 == #3 and #2 != #3
#17501  | 0.10937   | 18.32079  | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3
#20001  | 0.12500   | 20.04271  | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3
	"""

	Original: https://gist.github.com/vi3k6i5/dc3335ee46ab9f650b19885e8ade6c7a

	Additional dict.get() word replacer added

	Created on Wed Dec 20 14:03:51 2017

	@author: Paddy3118
	"""

	#!/bin/python
	from flashtext.keyword import KeywordProcessor
	import random
	import string
	import re
	import time
	import gc


	def get_word_of_length(str_length):
	    """Return a random word made of ``str_length`` lowercase ASCII letters.

	    A length of zero (or less) yields the empty string.
	    """
	    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))

	# Generate a pool of 100K random words, each 3 to 8 letters long.
	all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(100000)]

	print('Count  | FlashText | Regex    | dict.get() | Comments')
	print('------------------------------------------------------')
	# One benchmark row per requested keyword count: 1, 2501, 5001, ..., 20001.
	for keywords_length in range(1, 20002, 2500):
	    gc.collect()
	    # Choose 5000*10 terms and create a string to search in.
	    all_words_chosen = random.sample(all_words, 5000*10)
	    story = ' '.join(all_words_chosen)

	    # Get unique keywords from the list of words generated. set() drops
	    # duplicates, so there may be fewer than keywords_length keywords.
	    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

	    # Compile regex.
	    # source: https://stackoverflow.com/questions/6116978/python-replace-multiple-strings
	    # Every keyword maps to the same replacement text.
	    rep = dict.fromkeys(unique_keywords_sublist, '_keyword_')
	    # Keywords are joined with a plain '|' so the pattern is an alternation.
	    # Each keyword is escaped once here, so the timed substitution below
	    # can look the raw match up in ``rep`` directly.
	    # NOTE: the alternation has no word-boundary anchors, so it also
	    # replaces keywords that occur inside longer words, whereas the
	    # dict.get() variant only replaces whole space-separated words.
	    compiled_re = re.compile("|".join(map(re.escape, rep)))

	    # Add keywords to flashtext.
	    keyword_processor = KeywordProcessor()
	    for keyword in unique_keywords_sublist:
	        keyword_processor.add_keyword(keyword, '_keyword_')

	    # Keep automatic garbage collection out of the timed sections.
	    gc.disable()
	    # Time the modules. perf_counter() is a monotonic, high-resolution
	    # clock meant for measuring short durations; time.time() is a wall
	    # clock whose resolution can be too coarse for the fast dict.get() pass.
	    start = time.perf_counter()
	    # flashtext (but omitting its keyword setup)
	    _1 = keyword_processor.replace_keywords(story)
	    mid = time.perf_counter()
	    # re (omitting its regexp compilation)
	    _2 = compiled_re.sub(lambda m: rep[m.group(0)], story)
	    end = time.perf_counter()
	    # dict.get(word, word) returns the original word if it is not in the dict
	    _3 = ' '.join(rep.get(word, word) for word in story.split())
	    end3 = time.perf_counter()

	    gc.enable()
	    # Print the timings; the row is completed by the comparison summary below.
	    print(str(keywords_length).ljust(6), '|',
	          "{0:.5f}".format(mid - start).ljust(9), '|',
	          "{0:.5f}".format(end - mid).ljust(9), '|',
	          "{0:.5f}".format(end3 - end).ljust(9), '|',
	          end=' ')
	    # Report, for each pair of methods, whether their outputs are identical.
	    comment = []
	    for (name_a, out_a), (name_b, out_b) in (
	            (('#1', _1), ('#2', _2)),
	            (('#1', _1), ('#3', _3)),
	            (('#2', _2), ('#3', _3))):
	        relation = '!=' if out_a != out_b else '=='
	        comment.append('{0} {1} {2}'.format(name_a, relation, name_b))
	    print(' and '.join(comment))

	# Sample output
	#Count  | FlashText | Regex    | dict.get() | Comments
	#------------------------------------------------------
	#1      | 0.09375   | 0.00000   | 0.00000   | #1 != #2 and #1 == #3 and #2 != #3
	#2501   | 0.10938   | 3.67579   | 0.00000   | #1 != #2 and #1 == #3 and #2 != #3
	#5001   | 0.10939   | 6.98075   | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3
	#7501   | 0.09745   | 9.90468   | 0.01563   | #1 != #2 and #1 != #3 and #2 != #3
	#10001  | 0.10937   | 12.43827  | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3
	#12501  | 0.12498   | 14.58402  | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3
	#15001  | 0.12500   | 16.56654  | 0.01562   | #1 != #2 and #1 == #3 and #2 != #3
	#17501  | 0.10937   | 18.32079  | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3
	#20001  | 0.12500   | 20.04271  | 0.01563   | #1 != #2 and #1 == #3 and #2 != #3