# Python flashtext speed example against dict.get
"""
Original: https://gist.github.com/vi3k6i5/dc3335ee46ab9f650b19885e8ade6c7a
Additional dict.get() word replacer added
Created on Wed Dec 20 14:03:51 2017
@author: Paddy3118
"""
#!/bin/python
import gc
import random
import re
import string
import time

from flashtext.keyword import KeywordProcessor
def get_word_of_length(str_length):
    """Return a random word made of ``str_length`` lowercase ASCII letters.

    Each letter is drawn independently with ``random.choice`` from
    ``string.ascii_lowercase``. A ``str_length`` of zero (or less) yields
    the empty string, because the underlying ``range`` is then empty.
    """
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))
# Benchmark script: replace keywords in a random "story" using three
# techniques (flashtext, one compiled regex alternation, and a plain
# dict.get() per whitespace-separated word) and print one row of timings
# per keyword-set size.

# generate a list of 100K words of randomly chosen size
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(100000)]

print('Count | FlashText | Regex | dict.get() | Comments')
print('------------------------------------------------------')
for keywords_length in range(1, 20002, 2500):
    # start every round from a freshly collected heap
    gc.collect()

    # choose 5000*10 terms and create a string to search in.
    all_words_chosen = random.sample(all_words, 5000*10)
    story = ' '.join(all_words_chosen)

    # get unique keywords from the list of words generated.
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # compile regex
    # source: https://stackoverflow.com/questions/6116978/python-replace-multiple-strings
    rep = dict.fromkeys(unique_keywords_sublist, '_keyword_')
    # Escape the keys while building the pattern (not the matched text at
    # lookup time), so that rep[m.group(0)] below is always a valid lookup.
    # NOTE: the alternation has no word-boundary anchors, so it can also
    # replace a keyword that occurs inside a longer word; this is presumably
    # why the sample output reports '#1 != #2'.
    compiled_re = re.compile("|".join(re.escape(key) for key in rep))

    # add keywords to flashtext
    keyword_processor = KeywordProcessor()
    for keyword in unique_keywords_sublist:
        keyword_processor.add_keyword(keyword, '_keyword_')

    # time the modules with the garbage collector switched off; the
    # try/finally guarantees it is switched back on even if a replacer raises.
    # perf_counter() is used because these intervals are far below the
    # resolution time.time() offers on some platforms.
    gc.disable()
    try:
        start = time.perf_counter()
        # flashtext (but omitting its keyword setup)
        _1 = keyword_processor.replace_keywords(story)
        mid = time.perf_counter()
        # re (omitting its regexp compilation)
        _2 = compiled_re.sub(lambda m: rep[m.group(0)], story)
        end = time.perf_counter()
        # dict.get(word, word) returns the original word if it is not in the dict
        _3 = ' '.join(rep.get(word, word) for word in story.split())
        end3 = time.perf_counter()
    finally:
        gc.enable()

    # print output: keyword count followed by the three elapsed times
    print('{:<6} | {:<9.5f} | {:<9.5f} | {:<9.5f} |'.format(
        keywords_length, mid - start, end - mid, end3 - end), end=' ')

    # report, pair by pair, whether the three replacers produced equal text
    comment = [
        '{} {} {}'.format(left, '==' if a == b else '!=', right)
        for left, a, right, b in (
            ('#1', _1, '#2', _2),
            ('#1', _1, '#3', _3),
            ('#2', _2, '#3', _3),
        )
    ]
    print(' and '.join(comment))
# Sample output
#Count | FlashText | Regex | dict.get() | Comments
#------------------------------------------------------
#1 | 0.09375 | 0.00000 | 0.00000 | #1 != #2 and #1 == #3 and #2 != #3
#2501 | 0.10938 | 3.67579 | 0.00000 | #1 != #2 and #1 == #3 and #2 != #3
#5001 | 0.10939 | 6.98075 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#7501 | 0.09745 | 9.90468 | 0.01563 | #1 != #2 and #1 != #3 and #2 != #3
#10001 | 0.10937 | 12.43827 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#12501 | 0.12498 | 14.58402 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#15001 | 0.12500 | 16.56654 | 0.01562 | #1 != #2 and #1 == #3 and #2 != #3
#17501 | 0.10937 | 18.32079 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#20001 | 0.12500 | 20.04271 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
# (GitHub page footer captured along with the gist; not part of the script:
# "Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment")