Skip to content

Instantly share code, notes, and snippets.

@vi3k6i5
Created December 12, 2017 16:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vi3k6i5/4ea37490cddf6d8b4a1daf13f6e51457 to your computer and use it in GitHub Desktop.
Save vi3k6i5/4ea37490cddf6d8b4a1daf13f6e51457 to your computer and use it in GitHub Desktop.
Compare flashtext to Trie re
import re
class Trie():
"""Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern.
The corresponding Regex should match much faster than a simple Regex union."""
def __init__(self):
self.data = {}
def add(self, word):
ref = self.data
for char in word:
ref[char] = char in ref and ref[char] or {}
ref = ref[char]
ref[''] = 1
def dump(self):
return self.data
def quote(self, char):
return re.escape(char)
def _pattern(self, pData):
data = pData
if "" in data and len(data.keys()) == 1:
return None
alt = []
cc = []
q = 0
for char in sorted(data.keys()):
if isinstance(data[char], dict):
try:
recurse = self._pattern(data[char])
alt.append(self.quote(char) + recurse)
except:
cc.append(self.quote(char))
else:
q = 1
cconly = not len(alt) > 0
if len(cc) > 0:
if len(cc) == 1:
alt.append(cc[0])
else:
alt.append('[' + ''.join(cc) + ']')
if len(alt) == 1:
result = alt[0]
else:
result = "(?:" + "|".join(alt) + ")"
if q:
if cconly:
result += "?"
else:
result = "(?:%s)?" % result
return result
def pattern(self):
return self._pattern(self.dump())
import string
import re
import timeit
import random
def get_word_of_length(str_length):
# generate a random word of given length
return ''.join(random.choice(string.ascii_lowercase + ' +-\\') for _ in range(str_length))
# generate a list of 100K words of randomly chosen size
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)]
random.shuffle(all_words)
def trie_regex_from_words(words):
trie = Trie()
for word in words:
trie.add(word)
return re.compile(r"\b" + trie.pattern() + r"\b", re.IGNORECASE)
def find(word):
def fun():
return union.match(word)
return fun
#!/bin/python
from flashtext.keyword import KeywordProcessor
import random
import string
import re
import time
print('Count | FlashText | Regex ')
print('-------------------------------')
for keywords_length in range(0, 20001, 1000):
# chose 5000 terms and create a string to search in.
all_words_chosen = random.sample(all_words, 5000)
story = ' '.join(all_words_chosen)
# get unique keywords from the list of words generated.
unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))
# compile regex
compiled_re = trie_regex_from_words(unique_keywords_sublist)
# add keywords to flashtext
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(unique_keywords_sublist)
# time the modules
start = time.time()
_ = keyword_processor.extract_keywords(story)
mid = time.time()
_ = compiled_re.findall(story)
end = time.time()
# print output
print(str(keywords_length).ljust(6), '|',
"{0:.5f}".format(mid - start).ljust(9), '|',
"{0:.5f}".format(end - mid).ljust(9), '|',)
Count | FlashText | Regex
-------------------------------
0 | 0.01879 | 0.00258 |
1000 | 0.02584 | 0.01178 |
5000 | 0.02841 | 0.01469 |
10000 | 0.02966 | 0.01558 |
15000 | 0.03217 | 0.01875 |
20000 | 0.03328 | 0.01762 |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment