Skip to content

Instantly share code, notes, and snippets.

@vi3k6i5
Created November 14, 2017 16:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vi3k6i5/06b12bd2c5abde6de9dcfa43abc3d362 to your computer and use it in GitHub Desktop.
Save vi3k6i5/06b12bd2c5abde6de9dcfa43abc3d362 to your computer and use it in GitHub Desktop.
Comparing FlashText with a Cython implementation of a similar algorithm
#!/bin/python
from flashtext.keyword import KeywordProcessor
import random
import string
import re
from automaton import Automaton
import time
def get_word_of_length(str_length):
    """Return a random word made of ``str_length`` lowercase ASCII letters.

    Each character is drawn independently from ``string.ascii_lowercase``.
    A ``str_length`` of zero (or less) yields an empty string, because
    ``range`` produces no iterations in that case.
    """
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))
# Generate a pool of 100K random words, each between 3 and 8 characters long.
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(100000)]

print('Count | FlashText | Automaton ')
print('-------------------------------')

# Run the comparison for keyword counts 0, 1000, 2000, ..., 20000.
for keywords_length in range(0, 20001, 1000):
    # Choose 5000 terms and join them into the string that will be searched.
    all_words_chosen = random.sample(all_words, 5000)
    story = ' '.join(all_words_chosen)

    # Sample the keywords and de-duplicate them (the word pool may contain
    # repeats, so the final list can be shorter than keywords_length).
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # Build the Automaton from a mapping of each keyword to a one-item list
    # containing itself.
    A = Automaton(ignoreAccents=True, ignoreCase=True)
    dictionary = {val: [val] for val in unique_keywords_sublist}
    A.build(dictionary)

    # Register the same keywords with FlashText.
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(unique_keywords_sublist)

    # Time only the search step of each module. perf_counter() is used
    # instead of time() because it is a monotonic, high-resolution clock
    # meant for measuring short intervals; time() can jump if the system
    # clock is adjusted mid-run.
    start = time.perf_counter()
    _ = keyword_processor.extract_keywords(story)
    mid = time.perf_counter()
    _ = A.read(story)
    end = time.perf_counter()

    # Print one table row: keyword count, FlashText seconds, Automaton seconds.
    print(str(keywords_length).ljust(6), '|',
          "{0:.5f}".format(mid - start).ljust(9), '|',
          "{0:.5f}".format(end - mid).ljust(9), '|',)
# Output:
# Count | FlashText | Automaton
# -------------------------------
# 0 | 0.01584 | 0.02289 |
# 1000 | 0.02029 | 0.05322 |
# 2000 | 0.01965 | 0.05194 |
# 3000 | 0.02026 | 0.06139 |
# 4000 | 0.02118 | 0.07087 |
# 5000 | 0.02883 | 0.06348 |
# 6000 | 0.02281 | 0.12364 |
# 7000 | 0.02209 | 0.06058 |
# 8000 | 0.01994 | 0.06167 |
# 9000 | 0.02393 | 0.11298 |
# 10000 | 0.02939 | 0.07494 |
# 11000 | 0.02433 | 0.07365 |
# 12000 | 0.02576 | 0.07373 |
# 13000 | 0.02193 | 0.06714 |
# 14000 | 0.02301 | 0.07322 |
# 15000 | 0.02546 | 0.07115 |
# 16000 | 0.02495 | 0.13117 |
# 17000 | 0.02705 | 0.15396 |
# 18000 | 0.02646 | 0.08084 |
# 19000 | 0.02621 | 0.08861 |
# 20000 | 0.02399 | 0.07886 |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment