Skip to content

Instantly share code, notes, and snippets.

@tyteen4a03
Created February 16, 2022 17:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tyteen4a03/7d22690c594ff0fbca0d175327c7f0ca to your computer and use it in GitHub Desktop.
Save tyteen4a03/7d22690c594ff0fbca0d175327c7f0ca to your computer and use it in GitHub Desktop.
Corporate Clash Blocklist Benchmark
import re
import re2
import timeit
from statistics import stdev, fmean
from typing import Callable, List
from trieregex import TrieRegEx
from blocklist import BLOCKLIST, EXPECTED_RESULTS
def naive_blocklist_scanning(message: str) -> bool:
lowercase_message = message.lower()
for word in BLOCKLIST:
if re.search(r'\b{}\b'.format(word), lowercase_message):
return True
return False
precompiled_blocklist_regex = [re.compile(r'\b{}\b'.format(pattern)) for pattern in BLOCKLIST]
def precompiled_patterns_blocklist_scanning(message: str) -> bool:
lowercase_message = message.lower()
for bl_phrase in precompiled_blocklist_regex:
if bl_phrase.search(lowercase_message) is not None:
return True
return False
regex = re.compile(r'\b({})\b'.format('|'.join(BLOCKLIST)))
def precompiled_giant_pattern_re_blocklist_scanning(message: str) -> bool:
lowercase_message = message.lower()
return regex.search(lowercase_message) is not None
regex_re2 = re2.compile(r'\b({})\b'.format('|'.join(BLOCKLIST)))
def precompiled_giant_pattern_re2_blocklist_scanning(message: str) -> bool:
lowercase_message = message.lower()
return regex_re2.search(lowercase_message) is not None
blocklistTrie = TrieRegEx()
blocklistTrie.add(*BLOCKLIST)
BLOCKLIST_PATTERN = re.compile(rf'\b{blocklistTrie.regex()}\b')
BLOCKLIST_PATTERN_RE2 = re2.compile(rf'\b{blocklistTrie.regex()}\b')
def trie_blocklist_scanning(message: str) -> bool:
lowercase_message = message.lower()
if BLOCKLIST_PATTERN.search(lowercase_message) is not None:
return True
return False
def trie_re2_scanning(message: str) -> bool:
lowercase_message = message.lower()
if BLOCKLIST_PATTERN_RE2.search(lowercase_message) is not None:
return True
return False
def create_test(blocklist_func: Callable[[str], bool]) -> Callable[[], None]:
def inner():
for message, expected_result in EXPECTED_RESULTS.items():
result = blocklist_func(message)
if result != expected_result:
raise Exception(
f"incorrect result - for string '{message}' expecting {expected_result} but got {result}")
return inner
naive_timer = timeit.Timer(create_test(naive_blocklist_scanning))
precompiled_timer = timeit.Timer(create_test(precompiled_patterns_blocklist_scanning))
precompiled_giant_re_timer = timeit.Timer(create_test(precompiled_giant_pattern_re_blocklist_scanning))
precompiled_giant_re2_timer = timeit.Timer(create_test(precompiled_giant_pattern_re2_blocklist_scanning))
tries_timer = timeit.Timer(create_test(trie_blocklist_scanning))
tries_re2_timer = timeit.Timer(create_test(trie_re2_scanning))
RUNS = 10
REPETITIONS = 10
print(f"Configuration: {REPETITIONS} reps, {RUNS} runs")
print("Starting naive...")
naive_times = naive_timer.repeat(RUNS, REPETITIONS)
print("Starting precompiled...")
precompiled_times = precompiled_timer.repeat(RUNS, REPETITIONS)
print("Starting precompiled giant re...")
precompiled_giant_re_times = precompiled_giant_re_timer.repeat(RUNS, REPETITIONS)
print("Starting precompiled giant re2...")
precompiled_giant_re2_times = precompiled_giant_re2_timer.repeat(RUNS, REPETITIONS)
print("Starting tries...")
tries_times = tries_timer.repeat(RUNS, REPETITIONS)
print("Starting tries re2...")
tries_re2_times = tries_re2_timer.repeat(RUNS, REPETITIONS)
print(f"Naive: max {max(naive_times)}, min {min(naive_times)}, avg {fmean(naive_times)}, std {stdev(naive_times)}")
print(f"Precompiled: max {max(precompiled_times)}, min {min(precompiled_times)}, avg {fmean(precompiled_times)}, std {stdev(precompiled_times)}")
print(f"Precompiled (giant re): max {max(precompiled_giant_re_times)}, min {min(precompiled_giant_re_times)}, avg {fmean(precompiled_giant_re_times)}, std {stdev(precompiled_giant_re_times)}")
print(f"Precompiled (giant re2): max {max(precompiled_giant_re2_times)}, min {min(precompiled_giant_re2_times)}, avg {fmean(precompiled_giant_re2_times)}, std {stdev(precompiled_giant_re2_times)}")
print(f"Tries: max {max(tries_times)}, min {min(tries_times)}, avg {fmean(tries_times)}, std {format(stdev(tries_times), '.10f')}")
print(f"Tries re2: max {max(tries_re2_times)}, min {min(tries_re2_times)}, avg {fmean(tries_re2_times)}, std {stdev(tries_re2_times)}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment