Skip to content

Instantly share code, notes, and snippets.

@skohari
Created December 15, 2022 18:17
Show Gist options
  • Save skohari/7c0651fa020de4d9d00b1440e75904a9 to your computer and use it in GitHub Desktop.
Save skohari/7c0651fa020de4d9d00b1440e75904a9 to your computer and use it in GitHub Desktop.
# flashtext_regex_timing_keyword_extraction.py
from flashtext.keyword import KeywordProcessor
import random
import string
import re
import time
def get_word_of_length(str_length: int) -> str:
    """Return a random word made of lowercase ASCII letters.

    Args:
        str_length: Number of characters in the generated word. Zero (or a
            negative value) yields an empty string.

    Returns:
        A string of ``str_length`` characters, each drawn independently and
        uniformly from ``string.ascii_lowercase``.
    """
    # random.choices draws all k letters in a single call instead of one
    # random.choice call per character.
    return ''.join(random.choices(string.ascii_lowercase, k=str_length))
# Benchmark: time FlashText keyword extraction against a compiled regex
# alternation over documents of increasing size.

# Generate the word pool: 6,000,000 random words, each 3 to 8 letters long.
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(6_000_000)]

print('Count | FlashText | Regex ')
print('-------------------------------')

for awc in range(0, 1_500_000, 50_000):
    # Choose `awc` words from the pool and join them into the text to search.
    all_words_chosen = random.sample(all_words, awc)
    story = ' '.join(all_words_chosen)

    # Pick up to 200 distinct keywords (the set removes duplicate samples).
    unique_keywords_sublist = list(set(random.sample(all_words, 200)))

    # Compile one alternation of word-bounded keywords. re.escape is a no-op
    # for the lowercase ASCII words generated here; it keeps the pattern
    # correct if the word generator ever emits regex metacharacters.
    compiled_re = re.compile(
        '|'.join(r'\b' + re.escape(keyword) + r'\b' for keyword in unique_keywords_sublist)
    )

    # Load the same keywords into FlashText.
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(unique_keywords_sublist)

    # Time both approaches. perf_counter is a monotonic, high-resolution
    # clock intended for measuring intervals; time.time() is wall-clock time
    # and can jump if the system clock is adjusted mid-run.
    start = time.perf_counter()
    _ = keyword_processor.extract_keywords(story)
    mid = time.perf_counter()
    _ = compiled_re.findall(story)
    end = time.perf_counter()

    # Print one table row: word count, FlashText seconds, regex seconds.
    flashtext_seconds = mid - start
    regex_seconds = end - mid
    print(f"{awc:<6} | {flashtext_seconds:<9.5f} | {regex_seconds:<9.5f} |")
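# Sample output (illustrative only: the row counts below appear to come from a
# run with smaller sizes; the loop above steps from 0 in increments of 50,000):
# Count | FlashText | Regex
# -------------------------------
# 0 | 0.01668 | 0.00418 |
# 1000 | 0.02040 | 0.04781 |
# 5000 | 0.02180 | 0.26495 |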
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment