Skip to content

Instantly share code, notes, and snippets.

@Paddy3118
Created December 20, 2017 14:57
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save Paddy3118/05413cfe104bad1002e3e145a2b88d07 to your computer and use it in GitHub Desktop.
Python flashtext speed example against dict.get
"""
Original: https://gist.github.com/vi3k6i5/dc3335ee46ab9f650b19885e8ade6c7a
Additional dict.get() word replacer added
Created on Wed Dec 20 14:03:51 2017
@author: Paddy3118
"""
#!/bin/python
from flashtext.keyword import KeywordProcessor
import random
import string
import re
import time
import gc
def get_word_of_length(str_length):
    """Return a random word made of ``str_length`` lowercase ASCII letters.

    Each letter is drawn independently with ``random.choice`` from
    ``string.ascii_lowercase``.  A length of zero (or a negative length)
    yields the empty string because ``range`` is then empty.
    """
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))
# Benchmark: replace keywords in a random "story" with three techniques
# (FlashText, a compiled alternation regex, and a plain dict.get() lookup
# per whitespace-separated word) and print one timing row per keyword count.

# Generate a pool of 100K random words, each 3 to 8 letters long.
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(100000)]

print('Count | FlashText | Regex | dict.get() | Comments')
print('------------------------------------------------------')

for keywords_length in range(1, 20002, 2500):
    gc.collect()

    # Choose 5000*10 terms and create a string to search in.
    all_words_chosen = random.sample(all_words, 5000*10)
    story = ' '.join(all_words_chosen)

    # Get unique keywords from the list of words generated.
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # Compile regex.
    # source: https://stackoverflow.com/questions/6116978/python-replace-multiple-strings
    rep = {key: '_keyword_' for key in unique_keywords_sublist}
    # Escape the keys when building the pattern; the matched text is then
    # looked up in ``rep`` unmodified.
    # NOTE: the pattern has no word-boundary anchors, so it also replaces
    # keywords that occur inside longer words.  The "Comments" column
    # reports whether the three outputs agree.
    compiled_re = re.compile("|".join(re.escape(key) for key in rep))

    # Add keywords to flashtext.
    keyword_processor = KeywordProcessor()
    for keyword in unique_keywords_sublist:
        keyword_processor.add_keyword(keyword, '_keyword_')

    # Time the modules with the garbage collector switched off; the
    # try/finally guarantees it is switched back on even if a run raises.
    # perf_counter() is used because it is a high-resolution clock meant
    # for measuring short durations.
    gc.disable()
    try:
        start = time.perf_counter()
        # flashtext (but omitting its keyword setup)
        flashtext_out = keyword_processor.replace_keywords(story)
        mid = time.perf_counter()
        # re (omitting its regexp compilation)
        regex_out = compiled_re.sub(lambda m: rep[m.group(0)], story)
        end = time.perf_counter()
        # dict.get(word, word) returns the original word if it is not in the dict
        dict_out = ' '.join(rep.get(word, word) for word in story.split())
        end3 = time.perf_counter()
    finally:
        gc.enable()

    # Print the timing columns; the row is completed by the comparison
    # summary printed below.
    print(str(keywords_length).ljust(6), '|',
          "{0:.5f}".format(mid - start).ljust(9), '|',
          "{0:.5f}".format(end - mid).ljust(9), '|',
          "{0:.5f}".format(end3 - end).ljust(9), '|',
          end=' ')

    # Report pairwise whether the three techniques produced the same text
    # (#1 = FlashText, #2 = regex, #3 = dict.get).
    comparisons = (
        ('#1', '#2', flashtext_out == regex_out),
        ('#1', '#3', flashtext_out == dict_out),
        ('#2', '#3', regex_out == dict_out),
    )
    comment = ['{} {} {}'.format(left, '==' if same else '!=', right)
               for left, right, same in comparisons]
    print(' and '.join(comment))
# Sample output
#Count | FlashText | Regex | dict.get() | Comments
#------------------------------------------------------
#1 | 0.09375 | 0.00000 | 0.00000 | #1 != #2 and #1 == #3 and #2 != #3
#2501 | 0.10938 | 3.67579 | 0.00000 | #1 != #2 and #1 == #3 and #2 != #3
#5001 | 0.10939 | 6.98075 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#7501 | 0.09745 | 9.90468 | 0.01563 | #1 != #2 and #1 != #3 and #2 != #3
#10001 | 0.10937 | 12.43827 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#12501 | 0.12498 | 14.58402 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#15001 | 0.12500 | 16.56654 | 0.01562 | #1 != #2 and #1 == #3 and #2 != #3
#17501 | 0.10937 | 18.32079 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
#20001 | 0.12500 | 20.04271 | 0.01563 | #1 != #2 and #1 == #3 and #2 != #3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment