@sadovnychyi
Last active September 20, 2017 21:22
spaCy vs Acora for multi-keyword search benchmarking
python3 spacy_vs_acora.py acora
Total 10036 matches in 0.4687957763671875s
Filename: spacy_vs_acora.py
Line # Mem usage Increment Line Contents
================================================
14 80.6 MiB 0.0 MiB @memory_profiler.profile()
15 def main(test):
16 88.6 MiB 8.0 MiB random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
17 87.7 MiB -0.9 MiB random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
18 87.7 MiB 0.0 MiB phrases = [random_phrase() for i in range(N_PHRASES)]
19 88.6 MiB 0.9 MiB random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
20 88.6 MiB 0.0 MiB texts = [random_text() for i in range(N_TEXTS)]
21
22 88.6 MiB 0.0 MiB total = 0
23 88.6 MiB 0.0 MiB start = None
24
25 88.6 MiB 0.0 MiB if test == 'spacy':
26 nlp = spacy.load('en_dummy')
27 matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
28 start = time.time()
29 for text in nlp.pipe(texts):
30 total += len(matcher(text))
31 else:
32 3089.6 MiB 3001.0 MiB matcher = acora.AcoraBuilder(*phrases).build()
33 3089.6 MiB 0.0 MiB start = time.time()
34 3089.6 MiB 0.0 MiB for text in texts:
35 3089.6 MiB 0.0 MiB total += len(matcher.findall(text))
36 3089.6 MiB 0.0 MiB print('Total %s matches in %ss' % (total, time.time() - start))
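For reference, the Acora branch above just builds an Aho-Corasick automaton from the phrase list and scans each text with it. A minimal standalone sketch of that usage, with made-up placeholder keywords and text:

import acora

keywords = ['foo bar', 'baz qux']  # placeholder phrases
matcher = acora.AcoraBuilder(*keywords).build()  # compile the search automaton once
# findall() returns a list of (keyword, position) tuples for every occurrence.
matches = matcher.findall('foo bar and baz qux appear here')
print(len(matches))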
python3 spacy_vs_acora.py pyahocorasick
Total 9934 matches in 0.1664886474609375s
Filename: spacy_vs_acora.py
Line # Mem usage Increment Line Contents
================================================
15 80.1 MiB 0.0 MiB @memory_profiler.profile()
16 def main(test):
17 88.1 MiB 7.9 MiB random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
18 87.1 MiB -1.0 MiB random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
19 87.1 MiB 0.0 MiB phrases = [random_phrase() for i in range(N_PHRASES)]
20 88.1 MiB 1.0 MiB random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
21 88.1 MiB 0.0 MiB texts = [random_text() for i in range(N_TEXTS)]
22
23 88.1 MiB 0.0 MiB total = 0
24 88.1 MiB 0.0 MiB start = None
25
26 88.1 MiB 0.0 MiB if test == 'spacy':
27 nlp = spacy.load('en_dummy')
28 matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
29 start = time.time()
30 for text in nlp.pipe(texts):
31 total += len(matcher(text))
32 88.1 MiB 0.0 MiB elif test == 'pyahocorasick':
33 88.1 MiB 0.0 MiB matcher = ahocorasick.Automaton()
34 249.4 MiB 161.3 MiB for phrase in phrases:
35 249.4 MiB 0.0 MiB matcher.add_word(phrase, phrase)
36 249.6 MiB 0.2 MiB matcher.make_automaton()
37 249.6 MiB 0.0 MiB start = time.time()
38 251.0 MiB 1.4 MiB for text in texts:
39 251.0 MiB 0.0 MiB total += len(list(matcher.iter(text)))
40 else:
41 matcher = acora.AcoraBuilder(*phrases).build()
42 start = time.time()
43 for text in texts:
44 total += len(matcher.findall(text))
45 251.0 MiB 0.0 MiB print('Total %s matches in %ss' % (total, time.time() - start))
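The pyahocorasick branch follows that library's usual pattern: add each phrase to an Automaton, finalise it with make_automaton(), then iterate over the matches per text. A minimal sketch with the same placeholder data:

import ahocorasick

keywords = ['foo bar', 'baz qux']  # placeholder phrases
matcher = ahocorasick.Automaton()
for phrase in keywords:
    matcher.add_word(phrase, phrase)  # store the phrase itself as the payload
matcher.make_automaton()  # turn the trie into an Aho-Corasick automaton
# iter() yields (end_index, value) pairs for every match found in the text.
matches = list(matcher.iter('foo bar and baz qux appear here'))
print(len(matches))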
python3 spacy_vs_acora.py spacy
Warning: no model found for 'en_dummy'
Only loading the 'en' tokenizer.
Total 10050 matches in 2.5495481491088867s
Filename: spacy_vs_acora.py
Line # Mem usage Increment Line Contents
================================================
14 80.1 MiB 0.0 MiB @memory_profiler.profile()
15 def main(test):
16 88.2 MiB 8.1 MiB random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
17 87.2 MiB -1.0 MiB random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
18 87.2 MiB 0.0 MiB phrases = [random_phrase() for i in range(N_PHRASES)]
19 88.2 MiB 1.0 MiB random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
20 88.2 MiB 0.0 MiB texts = [random_text() for i in range(N_TEXTS)]
21
22 88.2 MiB 0.0 MiB total = 0
23 88.2 MiB 0.0 MiB start = None
24
25 88.2 MiB 0.0 MiB if test == 'spacy':
26 89.6 MiB 1.4 MiB nlp = spacy.load('en_dummy')
27 337.0 MiB 247.4 MiB matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
28 337.0 MiB 0.0 MiB start = time.time()
29 343.6 MiB 6.6 MiB for text in nlp.pipe(texts):
30 343.6 MiB 0.0 MiB total += len(matcher(text))
31 else:
32 matcher = acora.AcoraBuilder(*phrases).build()
33 start = time.time()
34 for text in texts:
35 total += len(matcher.findall(text))
36 343.6 MiB 0.0 MiB print('Total %s matches in %ss' % (total, time.time() - start))
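The spaCy run is the slowest mainly because nlp.pipe(texts) tokenizes every document inside the timed loop (and, per the warning, only the plain 'en' tokenizer is loaded). A small hypothetical variant of that branch that tokenizes up front, so only the matcher call is timed (the same idea as the comment below):

docs = list(nlp.pipe(texts))  # tokenize before starting the clock
start = time.time()
for doc in docs:
    total += len(matcher(doc))

The full benchmark script, spacy_vs_acora.py: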
import random
import string
import acora
import spacy
import ahocorasick
import sys
import time
import memory_profiler

N_PHRASES = 100000
N_TEXTS = 1000


@memory_profiler.profile()
def main(test):
    # Build a random vocabulary of multi-word phrases, then texts that mix
    # whole phrases with random filler words.
    random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
    random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
    phrases = [random_phrase() for i in range(N_PHRASES)]
    random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
    texts = [random_text() for i in range(N_TEXTS)]

    total = 0
    start = None

    if test == 'spacy':
        nlp = spacy.load('en_dummy')
        matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
        start = time.time()
        for text in nlp.pipe(texts):
            total += len(matcher(text))
    elif test == 'pyahocorasick':
        matcher = ahocorasick.Automaton()
        for phrase in phrases:
            matcher.add_word(phrase, phrase)
        matcher.make_automaton()
        start = time.time()
        for text in texts:
            total += len(list(matcher.iter(text)))
    else:
        matcher = acora.AcoraBuilder(*phrases).build()
        start = time.time()
        for text in texts:
            total += len(matcher.findall(text))

    print('Total %s matches in %ss' % (total, time.time() - start))


if __name__ == '__main__':
    main(sys.argv[-1])
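Note that the match totals differ between the three runs (10036, 9934, 10050) because the phrases and texts are regenerated randomly on every invocation. Seeding the RNG at the top of main(), an optional tweak that is not in the original script, would make all backends search identical data:

@memory_profiler.profile()
def main(test):
    random.seed(0)  # optional: fix the RNG so every run generates the same phrases/texts
    random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
    # ... rest of main() unchanged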
@honnibal

Nice benchmark!

I made another version that moves the tokenization outside the timer, to check how the matcher itself performs. I might've stuffed up the benchmark, but I get 10112 matches in 0.01 seconds. Code below.

import random
import string
import sys
import time
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab


N_PHRASES = 100000
N_TEXTS = 1000


def main():
    random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
    random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
    phrases = [random_phrase() for i in range(N_PHRASES)]
    random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
    texts = [random_text() for i in range(N_TEXTS)]

    total = 0
    start = None

    vocab = Vocab(lex_attr_getters={})
    matcher = PhraseMatcher(vocab)
    patterns = (Doc(vocab, words=phrase.split()) for phrase in phrases)
    matcher.add('Pattern', None, *patterns)
    docs = [Doc(vocab, words=text.split()) for text in texts]
    start = time.time()
    for doc in docs:
        total += len(matcher(doc))
    print('Total %s matches in %ss' % (total, time.time() - start))


if __name__ == '__main__':
    main()
