Created
December 20, 2017 14:57
-
-
Save Paddy3118/05413cfe104bad1002e3e145a2b88d07 to your computer and use it in GitHub Desktop.
Python flashtext speed example against dict.get
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Original: https://gist.github.com/vi3k6i5/dc3335ee46ab9f650b19885e8ade6c7a
Additional dict.get() word replacer added
Created on Wed Dec 20 14:03:51 2017
@author: Paddy3118
"""
#!/bin/python | |
from flashtext.keyword import KeywordProcessor | |
import random | |
import string | |
import re | |
import time | |
import gc | |
def get_word_of_length(str_length: int) -> str:
    """Return a random word made of ``str_length`` lowercase ASCII letters.

    Each letter is drawn independently with ``random.choice`` from
    ``string.ascii_lowercase``.

    Args:
        str_length: Number of letters in the generated word. Zero or a
            negative value produces the empty string, because the
            underlying ``range`` is then empty.

    Returns:
        The generated word.
    """
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))
# Benchmark: replace keywords in a block of text three ways (flashtext,
# a compiled regex alternation, and a per-word dict.get lookup), print the
# time each one takes and whether the three results agree.

# Generate a pool of 100K random words, each 3 to 8 letters long.
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(100000)]

print('Count | FlashText | Regex | dict.get() | Comments')
print('------------------------------------------------------')

for keywords_length in range(1, 20002, 2500):
    gc.collect()

    # Choose 5000*10 words from the pool and join them into the text to search.
    all_words_chosen = random.sample(all_words, 5000 * 10)
    story = ' '.join(all_words_chosen)

    # Sample the keywords, then de-duplicate them; the pool can contain
    # repeated words, so the unique list may be shorter than keywords_length.
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # Map every keyword to its replacement text.
    rep = dict.fromkeys(unique_keywords_sublist, '_keyword_')

    # Compile the regex.
    # source: https://stackoverflow.com/questions/6116978/python-replace-multiple-strings
    # The keywords are escaped while the pattern is built (not when the match
    # is looked up), and the alternation is wrapped in \b...\b so that only
    # whole words are replaced. Without the word boundaries the regex also
    # rewrites keywords occurring inside longer words, so its output cannot
    # be compared with the whole-word dict.get() replacement.
    compiled_re = re.compile(r'\b(?:' + '|'.join(map(re.escape, rep)) + r')\b')

    # Add the keywords to flashtext.
    keyword_processor = KeywordProcessor()
    for keyword in unique_keywords_sublist:
        keyword_processor.add_keyword(keyword, '_keyword_')

    # Time the three approaches with the garbage collector switched off.
    # perf_counter() is used instead of time() because it is the
    # high-resolution clock intended for measuring short durations.
    gc.disable()
    try:
        start = time.perf_counter()
        # flashtext (but omitting its keyword setup)
        _1 = keyword_processor.replace_keywords(story)
        mid = time.perf_counter()
        # re (omitting its regexp compilation)
        _2 = compiled_re.sub(lambda m: rep[m.group(0)], story)
        end = time.perf_counter()
        # dict.get(word, word) returns the original word if it is not in the dict
        _3 = ' '.join(rep.get(word, word) for word in story.split())
        end3 = time.perf_counter()
    finally:
        # Always restore garbage collection, even if one of the replacers raised.
        gc.enable()

    # Print the timings; the comparison comment completes the same line.
    print(str(keywords_length).ljust(6), '|',
          "{0:.5f}".format(mid - start).ljust(9), '|',
          "{0:.5f}".format(end - mid).ljust(9), '|',
          "{0:.5f}".format(end3 - end).ljust(9), '|',
          end=' ')

    # Report pairwise whether the three outputs are identical.
    comment = []
    for left_tag, left, right_tag, right in (('#1', _1, '#2', _2),
                                             ('#1', _1, '#3', _3),
                                             ('#2', _2, '#3', _3)):
        relation = '==' if left == right else '!='
        comment.append('{0} {1} {2}'.format(left_tag, relation, right_tag))
    print(' and '.join(comment))

# Sample output
# NOTE: recorded with the earlier revision of this script, which timed with
# time.time() and used a regex without word boundaries -- hence the coarse
# timings and the '#1 != #2' mismatches below. Re-run for current figures.
#Count | FlashText | Regex | dict.get() | Comments
#------------------------------------------------------
#1 | 0.09375 | 0.00000 | 0.00000 | #1 != #2 and #1 == #3 and #2 != #3
#2501 | 0.10938 | 3.67579 | 0.00000 | #1 != #2 and #1 == #3 and #2 != #3
#5001 | 0.10939 | 6.98075 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#7501 | 0.09745 | 9.90468 | 0.01563 | #1 != #2 and #1 != #3 and #2 != #3
#10001 | 0.10937 | 12.43827 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#12501 | 0.12498 | 14.58402 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#15001 | 0.12500 | 16.56654 | 0.01562 | #1 != #2 and #1 == #3 and #2 != #3
#17501 | 0.10937 | 18.32079 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#20001 | 0.12500 | 20.04271 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment