Skip to content

Instantly share code, notes, and snippets.

@blcarlson01
Last active April 14, 2023 12:18
Show Gist options
  • Save blcarlson01/e59d891ad799b8a5f009abad2076c88c to your computer and use it in GitHub Desktop.
Save blcarlson01/e59d891ad799b8a5f009abad2076c88c to your computer and use it in GitHub Desktop.
Compare Regex Methods
import timeit
from statistics import mean, stdev
import math
import matplotlib.pyplot as plt
import re2
from flashtext import KeywordProcessor
import ahocorasick
import re
from scipy import stats
import time
def run_re(keywords, text, pattern):
matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
return len(matches)
def run_re2(keywords, text, pattern):
matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
return len(matches)
def run_flashtext(keywords, text, processor):
matches = [(match) for match in processor.extract_keywords(text,span_info=True)]
return len(matches)
def run_pyahocorasick(keywords, text, automaton):
matches = [(match[0], match[0] + len(match[1])) for match in automaton.iter(text)]
return len(matches)
def generate_summary(keywords, texts):
methods = [
("re", run_re),
("re2", run_re2),
("flashtext", run_flashtext),
("pyahocorasick", run_pyahocorasick)
]
summaries = {}
for method_name, method_func in methods:
times = []
if method_name == 'flashtext':
print('flash')
processor = KeywordProcessor()
for keyword in keywords:
processor.add_keyword(keyword)
method_cache = processor
elif method_name == 'pyahocorasick':
print('py')
automaton = ahocorasick.Automaton()
for keyword in keywords:
automaton.add_word(keyword, keyword)
automaton.make_automaton()
method_cache = automaton
elif method_name == 're':
method_cache = re.compile('|'.join(keywords))
elif method_name == 're2':
method_cache = re2.compile('|'.join(keywords))
else:
print('no cache')
method_cache = ''
for text in texts:
start_time = time.perf_counter()
result = method_func(keywords, text, method_cache)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
times.append(elapsed_time)
sum_time = sum(times)
avg_time = mean(times)
min_time = min(times)
max_time = max(times)
std_dev = stdev(times)
confidence_interval = stats.t.interval(0.95, len(times)-1, loc=avg_time, scale=stats.sem(times))
summaries[method_name] = {
"sum_time" : sum_time,
"average_time": avg_time,
"min_time": min_time,
"max_time": max_time,
"std_dev": std_dev,
"confidence_interval": confidence_interval
}
#print(f"\nSummary for method: {method_name}")
#print(f"Total Time: {sum_time:.6f} seconds")
#print(f"Average Time: {avg_time:.6f} seconds")
#print(f"Minimum Time: {min_time:.6f} seconds")
#print(f"Maximum Time: {max_time:.6f} seconds")
#print(f"Standard Deviation: {std_dev:.6f}")
#print(f"95% Confidence Interval: ({confidence_interval[0]:.6f}, {confidence_interval[1]:.6f})")
return summaries
def plot_summary(summary):
x_values = list(summary.keys())
y_values = [data['average_time'] for data in summary.values()]
fig, ax = plt.subplots()
ax.bar(x_values, y_values)
ax.set_ylabel('Average Time')
ax.set_xlabel('Method')
ax.set_title('Average Time of Finding a Keyword per Method')
plt.show()
def plot_total_time(summary):
x_values = list(summary.keys())
y_values = [data['sum_time'] for data in summary.values()]
fig, ax = plt.subplots()
ax.bar(x_values, y_values)
ax.set_ylabel('Total Time')
ax.set_xlabel('Method')
ax.set_title('Total Time Per Method')
plt.show()
def find_fastest_method(summary):
fastest_time = float('inf')
fastest_method = ''
for method, data in summary.items():
avg_time = data['average_time']
if avg_time < fastest_time:
fastest_time = avg_time
fastest_method = method
print(f"\nThe fastest method is {fastest_method} with an average time of {fastest_time:.6f} seconds")
for method, data in summary.items():
if method != fastest_method:
avg_time = data['average_time']
speedup = (avg_time - fastest_time) / fastest_time * 100
print(f"{method} is {speedup:.2f}% slower than {fastest_method}")
import matplotlib.pyplot as plt
def plot_avg_string_length(texts):
avg_lengths = []
for text in texts:
avg_lengths.append(len(text))
plt.plot(avg_lengths)
plt.title('String Length over Message')
plt.xlabel('Message')
plt.ylabel('String Length')
plt.show()
from prettytable import PrettyTable
from math import ceil
def generate_table(summary):
table = PrettyTable()
# Determine the fastest method
fastest_method = min(summary.keys(), key=lambda x: summary[x]['average_time'])
# Sort methods by average time
sorted_methods = sorted(summary.keys(), key=lambda x: summary[x]['average_time'], reverse=True)
table.field_names = ['Method', 'Total Time', 'Average Time', 'Min Time', 'Max Time', 'Std Dev', 'Confidence Interval']
for i, method in enumerate(sorted_methods):
rank = i + 1
data = summary[method]
sum_time = f"{round(data['sum_time'], 6)} sec"
avg_time = f"{round(data['average_time'], 6)} sec"
min_time = f"{round(data['min_time'], 6)} sec"
max_time = f"{round(data['max_time'], 6)} sec"
std_dev = f"{round(data['std_dev'], 6)} sec"
conf_interval = data['confidence_interval']
# Highlight the fastest method in the table
if method == fastest_method:
table.add_row([f"*{method}", sum_time, avg_time, min_time, max_time, std_dev, conf_interval])
else:
table.add_row([method, sum_time, avg_time, min_time, max_time, std_dev, conf_interval])
print(table)
from lorem_text import lorem
from faker import Faker
import random
def main():
keywords = ['d', 'ipsum', 'dolor', 'sit', 'amet']
fake = Faker()
keywords += [fake.word() for _ in range(100)]
texts = [lorem.paragraph() for _ in range(100000)]
texts.append(test)
summary = generate_summary(keywords, texts)
find_fastest_method(summary)
plot_summary(summary)
plot_total_time(summary)
plot_avg_string_length(texts)
generate_table(summary)
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment