blcarlson01/gist:e59d891ad799b8a5f009abad2076c88c

## gistfile1.txt
import timeit
from statistics import mean, stdev
import math
import matplotlib.pyplot as plt
import re2
from flashtext import KeywordProcessor
import ahocorasick
import re
from scipy import stats
import time

def run_re(keywords, text, pattern):
    matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
    return len(matches)

def run_re2(keywords, text, pattern):
    matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
    return len(matches)

def run_flashtext(keywords, text, processor):
    matches = [(match) for match in processor.extract_keywords(text,span_info=True)]
    return len(matches)

def run_pyahocorasick(keywords, text, automaton):
    matches = [(match[0], match[0] + len(match[1])) for match in automaton.iter(text)]
    return len(matches)

def generate_summary(keywords, texts):
    methods = [
        ("re", run_re),
        ("re2", run_re2),
        ("flashtext", run_flashtext),
        ("pyahocorasick", run_pyahocorasick)
    ]
    summaries = {}
    for method_name, method_func in methods:
        times = []
        if method_name == 'flashtext':
            print('flash')
            processor = KeywordProcessor()
            for keyword in keywords:
                processor.add_keyword(keyword)
            method_cache = processor
        elif method_name == 'pyahocorasick':
            print('py')
            automaton = ahocorasick.Automaton()
            for keyword in keywords:
                automaton.add_word(keyword, keyword)
            automaton.make_automaton()
            method_cache = automaton
        elif method_name == 're':
            method_cache = re.compile('|'.join(keywords))
        elif method_name == 're2':
            method_cache = re2.compile('|'.join(keywords))
        else:
            print('no cache')
            method_cache = ''

        for text in texts:
            start_time = time.perf_counter()
            result = method_func(keywords, text, method_cache)
            end_time = time.perf_counter()
            elapsed_time = end_time - start_time
            times.append(elapsed_time)
        sum_time = sum(times)
        avg_time = mean(times)
        min_time = min(times)
        max_time = max(times)
        std_dev = stdev(times)
        confidence_interval = stats.t.interval(0.95, len(times)-1, loc=avg_time, scale=stats.sem(times))
        summaries[method_name] = {
            "sum_time" : sum_time,
            "average_time": avg_time,
            "min_time": min_time,
            "max_time": max_time,
            "std_dev": std_dev,
            "confidence_interval": confidence_interval
        }
        #print(f"\nSummary for method: {method_name}")
        #print(f"Total Time: {sum_time:.6f} seconds")
        #print(f"Average Time: {avg_time:.6f} seconds")
        #print(f"Minimum Time: {min_time:.6f} seconds")
        #print(f"Maximum Time: {max_time:.6f} seconds")
        #print(f"Standard Deviation: {std_dev:.6f}")
        #print(f"95% Confidence Interval: ({confidence_interval[0]:.6f}, {confidence_interval[1]:.6f})")
    return summaries

def plot_summary(summary):
    x_values = list(summary.keys())
    y_values = [data['average_time'] for data in summary.values()]

    fig, ax = plt.subplots()
    ax.bar(x_values, y_values)

    ax.set_ylabel('Average Time')
    ax.set_xlabel('Method')
    ax.set_title('Average Time of Finding a Keyword per Method')

    plt.show()

def plot_total_time(summary):
    x_values = list(summary.keys())
    y_values = [data['sum_time'] for data in summary.values()]

    fig, ax = plt.subplots()
    ax.bar(x_values, y_values)

    ax.set_ylabel('Total Time')
    ax.set_xlabel('Method')
    ax.set_title('Total Time Per Method')

    plt.show()

def find_fastest_method(summary):
    fastest_time = float('inf')
    fastest_method = ''
    for method, data in summary.items():
        avg_time = data['average_time']
        if avg_time < fastest_time:
            fastest_time = avg_time
            fastest_method = method
    print(f"\nThe fastest method is {fastest_method} with an average time of {fastest_time:.6f} seconds")
    for method, data in summary.items():
        if method != fastest_method:
            avg_time = data['average_time']
            speedup = (avg_time - fastest_time) / fastest_time * 100
            print(f"{method} is {speedup:.2f}% slower than {fastest_method}")

import matplotlib.pyplot as plt

def plot_avg_string_length(texts):
    avg_lengths = []
    for text in texts:
        avg_lengths.append(len(text))
    plt.plot(avg_lengths)
    plt.title('String Length over Message')
    plt.xlabel('Message')
    plt.ylabel('String Length')
    plt.show()

from prettytable import PrettyTable
from math import ceil

def generate_table(summary):
    table = PrettyTable()
     # Determine the fastest method
    fastest_method = min(summary.keys(), key=lambda x: summary[x]['average_time'])
    # Sort methods by average time
    sorted_methods = sorted(summary.keys(), key=lambda x: summary[x]['average_time'], reverse=True)

    table.field_names = ['Method', 'Total Time', 'Average Time', 'Min Time', 'Max Time', 'Std Dev', 'Confidence Interval']
    for i, method in enumerate(sorted_methods):
        rank = i + 1
        data = summary[method]
        sum_time = f"{round(data['sum_time'], 6)} sec"
        avg_time = f"{round(data['average_time'], 6)} sec"
        min_time = f"{round(data['min_time'], 6)} sec"
        max_time = f"{round(data['max_time'], 6)} sec"
        std_dev = f"{round(data['std_dev'], 6)} sec"
        conf_interval = data['confidence_interval']

        # Highlight the fastest method in the table
        if method == fastest_method:
            table.add_row([f"*{method}", sum_time, avg_time, min_time, max_time, std_dev, conf_interval])
        else:
            table.add_row([method, sum_time, avg_time, min_time, max_time, std_dev, conf_interval])

    print(table)

from lorem_text import lorem
from faker import Faker
import random
def main():
    keywords = ['d', 'ipsum', 'dolor', 'sit', 'amet']
    fake = Faker()
    keywords += [fake.word() for _ in range(100)]
    texts = [lorem.paragraph() for _ in range(100000)]
    texts.append(test)

    summary = generate_summary(keywords, texts)
    find_fastest_method(summary)
    plot_summary(summary)
    plot_total_time(summary)
    plot_avg_string_length(texts)
    generate_table(summary)
main()
	import timeit
	from statistics import mean, stdev
	import math
	import matplotlib.pyplot as plt
	import re2
	from flashtext import KeywordProcessor
	import ahocorasick
	import re
	from scipy import stats
	import time

	def run_re(keywords, text, pattern):
	matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
	return len(matches)

	def run_re2(keywords, text, pattern):
	matches = [(match.group(),match.start(), match.end()) for match in pattern.finditer(text)]
	return len(matches)

	def run_flashtext(keywords, text, processor):
	matches = [(match) for match in processor.extract_keywords(text,span_info=True)]
	return len(matches)

	def run_pyahocorasick(keywords, text, automaton):
	matches = [(match[0], match[0] + len(match[1])) for match in automaton.iter(text)]
	return len(matches)

	def generate_summary(keywords, texts):
	methods = [
	("re", run_re),
	("re2", run_re2),
	("flashtext", run_flashtext),
	("pyahocorasick", run_pyahocorasick)
	]
	summaries = {}
	for method_name, method_func in methods:
	times = []
	if method_name == 'flashtext':
	print('flash')
	processor = KeywordProcessor()
	for keyword in keywords:
	processor.add_keyword(keyword)
	method_cache = processor
	elif method_name == 'pyahocorasick':
	print('py')
	automaton = ahocorasick.Automaton()
	for keyword in keywords:
	automaton.add_word(keyword, keyword)
	automaton.make_automaton()
	method_cache = automaton
	elif method_name == 're':
	method_cache = re.compile('\|'.join(keywords))
	elif method_name == 're2':
	method_cache = re2.compile('\|'.join(keywords))
	else:
	print('no cache')
	method_cache = ''

	for text in texts:
	start_time = time.perf_counter()
	result = method_func(keywords, text, method_cache)
	end_time = time.perf_counter()
	elapsed_time = end_time - start_time
	times.append(elapsed_time)
	sum_time = sum(times)
	avg_time = mean(times)
	min_time = min(times)
	max_time = max(times)
	std_dev = stdev(times)
	confidence_interval = stats.t.interval(0.95, len(times)-1, loc=avg_time, scale=stats.sem(times))
	summaries[method_name] = {
	"sum_time" : sum_time,
	"average_time": avg_time,
	"min_time": min_time,
	"max_time": max_time,
	"std_dev": std_dev,
	"confidence_interval": confidence_interval
	}
	#print(f"\nSummary for method: {method_name}")
	#print(f"Total Time: {sum_time:.6f} seconds")
	#print(f"Average Time: {avg_time:.6f} seconds")
	#print(f"Minimum Time: {min_time:.6f} seconds")
	#print(f"Maximum Time: {max_time:.6f} seconds")
	#print(f"Standard Deviation: {std_dev:.6f}")
	#print(f"95% Confidence Interval: ({confidence_interval[0]:.6f}, {confidence_interval[1]:.6f})")
	return summaries

	def plot_summary(summary):
	x_values = list(summary.keys())
	y_values = [data['average_time'] for data in summary.values()]

	fig, ax = plt.subplots()
	ax.bar(x_values, y_values)

	ax.set_ylabel('Average Time')
	ax.set_xlabel('Method')
	ax.set_title('Average Time of Finding a Keyword per Method')

	plt.show()

	def plot_total_time(summary):
	x_values = list(summary.keys())
	y_values = [data['sum_time'] for data in summary.values()]

	fig, ax = plt.subplots()
	ax.bar(x_values, y_values)

	ax.set_ylabel('Total Time')
	ax.set_xlabel('Method')
	ax.set_title('Total Time Per Method')

	plt.show()

	def find_fastest_method(summary):
	fastest_time = float('inf')
	fastest_method = ''
	for method, data in summary.items():
	avg_time = data['average_time']
	if avg_time < fastest_time:
	fastest_time = avg_time
	fastest_method = method
	print(f"\nThe fastest method is {fastest_method} with an average time of {fastest_time:.6f} seconds")
	for method, data in summary.items():
	if method != fastest_method:
	avg_time = data['average_time']
	speedup = (avg_time - fastest_time) / fastest_time * 100
	print(f"{method} is {speedup:.2f}% slower than {fastest_method}")

	import matplotlib.pyplot as plt

	def plot_avg_string_length(texts):
	avg_lengths = []
	for text in texts:
	avg_lengths.append(len(text))
	plt.plot(avg_lengths)
	plt.title('String Length over Message')
	plt.xlabel('Message')
	plt.ylabel('String Length')
	plt.show()

	from prettytable import PrettyTable
	from math import ceil

	def generate_table(summary):
	table = PrettyTable()
	# Determine the fastest method
	fastest_method = min(summary.keys(), key=lambda x: summary[x]['average_time'])
	# Sort methods by average time
	sorted_methods = sorted(summary.keys(), key=lambda x: summary[x]['average_time'], reverse=True)

	table.field_names = ['Method', 'Total Time', 'Average Time', 'Min Time', 'Max Time', 'Std Dev', 'Confidence Interval']
	for i, method in enumerate(sorted_methods):
	rank = i + 1
	data = summary[method]
	sum_time = f"{round(data['sum_time'], 6)} sec"
	avg_time = f"{round(data['average_time'], 6)} sec"
	min_time = f"{round(data['min_time'], 6)} sec"
	max_time = f"{round(data['max_time'], 6)} sec"
	std_dev = f"{round(data['std_dev'], 6)} sec"
	conf_interval = data['confidence_interval']

	# Highlight the fastest method in the table
	if method == fastest_method:
	table.add_row([f"*{method}", sum_time, avg_time, min_time, max_time, std_dev, conf_interval])
	else:
	table.add_row([method, sum_time, avg_time, min_time, max_time, std_dev, conf_interval])

	print(table)

	from lorem_text import lorem
	from faker import Faker
	import random
	def main():
	keywords = ['d', 'ipsum', 'dolor', 'sit', 'amet']
	fake = Faker()
	keywords += [fake.word() for _ in range(100)]
	texts = [lorem.paragraph() for _ in range(100000)]
	texts.append(test)

	summary = generate_summary(keywords, texts)
	find_fastest_method(summary)
	plot_summary(summary)
	plot_total_time(summary)
	plot_avg_string_length(texts)
	generate_table(summary)
	main()