Created
February 16, 2022 17:24
-
-
Save tyteen4a03/7d22690c594ff0fbca0d175327c7f0ca to your computer and use it in GitHub Desktop.
Corporate Clash Blocklist Benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import re2 | |
import timeit | |
from statistics import stdev, fmean | |
from typing import Callable, List | |
from trieregex import TrieRegEx | |
from blocklist import BLOCKLIST, EXPECTED_RESULTS | |
def naive_blocklist_scanning(message: str) -> bool: | |
lowercase_message = message.lower() | |
for word in BLOCKLIST: | |
if re.search(r'\b{}\b'.format(word), lowercase_message): | |
return True | |
return False | |
precompiled_blocklist_regex = [re.compile(r'\b{}\b'.format(pattern)) for pattern in BLOCKLIST] | |
def precompiled_patterns_blocklist_scanning(message: str) -> bool: | |
lowercase_message = message.lower() | |
for bl_phrase in precompiled_blocklist_regex: | |
if bl_phrase.search(lowercase_message) is not None: | |
return True | |
return False | |
regex = re.compile(r'\b({})\b'.format('|'.join(BLOCKLIST))) | |
def precompiled_giant_pattern_re_blocklist_scanning(message: str) -> bool: | |
lowercase_message = message.lower() | |
return regex.search(lowercase_message) is not None | |
regex_re2 = re2.compile(r'\b({})\b'.format('|'.join(BLOCKLIST))) | |
def precompiled_giant_pattern_re2_blocklist_scanning(message: str) -> bool: | |
lowercase_message = message.lower() | |
return regex_re2.search(lowercase_message) is not None | |
blocklistTrie = TrieRegEx() | |
blocklistTrie.add(*BLOCKLIST) | |
BLOCKLIST_PATTERN = re.compile(rf'\b{blocklistTrie.regex()}\b') | |
BLOCKLIST_PATTERN_RE2 = re2.compile(rf'\b{blocklistTrie.regex()}\b') | |
def trie_blocklist_scanning(message: str) -> bool: | |
lowercase_message = message.lower() | |
if BLOCKLIST_PATTERN.search(lowercase_message) is not None: | |
return True | |
return False | |
def trie_re2_scanning(message: str) -> bool: | |
lowercase_message = message.lower() | |
if BLOCKLIST_PATTERN_RE2.search(lowercase_message) is not None: | |
return True | |
return False | |
def create_test(blocklist_func: Callable[[str], bool]) -> Callable[[], None]: | |
def inner(): | |
for message, expected_result in EXPECTED_RESULTS.items(): | |
result = blocklist_func(message) | |
if result != expected_result: | |
raise Exception( | |
f"incorrect result - for string '{message}' expecting {expected_result} but got {result}") | |
return inner | |
naive_timer = timeit.Timer(create_test(naive_blocklist_scanning)) | |
precompiled_timer = timeit.Timer(create_test(precompiled_patterns_blocklist_scanning)) | |
precompiled_giant_re_timer = timeit.Timer(create_test(precompiled_giant_pattern_re_blocklist_scanning)) | |
precompiled_giant_re2_timer = timeit.Timer(create_test(precompiled_giant_pattern_re2_blocklist_scanning)) | |
tries_timer = timeit.Timer(create_test(trie_blocklist_scanning)) | |
tries_re2_timer = timeit.Timer(create_test(trie_re2_scanning)) | |
RUNS = 10 | |
REPETITIONS = 10 | |
print(f"Configuration: {REPETITIONS} reps, {RUNS} runs") | |
print("Starting naive...") | |
naive_times = naive_timer.repeat(RUNS, REPETITIONS) | |
print("Starting precompiled...") | |
precompiled_times = precompiled_timer.repeat(RUNS, REPETITIONS) | |
print("Starting precompiled giant re...") | |
precompiled_giant_re_times = precompiled_giant_re_timer.repeat(RUNS, REPETITIONS) | |
print("Starting precompiled giant re2...") | |
precompiled_giant_re2_times = precompiled_giant_re2_timer.repeat(RUNS, REPETITIONS) | |
print("Starting tries...") | |
tries_times = tries_timer.repeat(RUNS, REPETITIONS) | |
print("Starting tries re2...") | |
tries_re2_times = tries_re2_timer.repeat(RUNS, REPETITIONS) | |
print(f"Naive: max {max(naive_times)}, min {min(naive_times)}, avg {fmean(naive_times)}, std {stdev(naive_times)}") | |
print(f"Precompiled: max {max(precompiled_times)}, min {min(precompiled_times)}, avg {fmean(precompiled_times)}, std {stdev(precompiled_times)}") | |
print(f"Precompiled (giant re): max {max(precompiled_giant_re_times)}, min {min(precompiled_giant_re_times)}, avg {fmean(precompiled_giant_re_times)}, std {stdev(precompiled_giant_re_times)}") | |
print(f"Precompiled (giant re2): max {max(precompiled_giant_re2_times)}, min {min(precompiled_giant_re2_times)}, avg {fmean(precompiled_giant_re2_times)}, std {stdev(precompiled_giant_re2_times)}") | |
print(f"Tries: max {max(tries_times)}, min {min(tries_times)}, avg {fmean(tries_times)}, std {format(stdev(tries_times), '.10f')}") | |
print(f"Tries re2: max {max(tries_re2_times)}, min {min(tries_re2_times)}, avg {fmean(tries_re2_times)}, std {stdev(tries_re2_times)}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment