Created
November 29, 2017 23:21
-
-
Save raypendergraph/398877883ed408ab105a503787a2d257 to your computer and use it in GitHub Desktop.
Algorithm Snippets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: ascii -*- | |
# A pythonic implementation of the Aho-Corasick string searching algorithm which (I hope | |
# you will find) needs no documentation if you are reading the white paper along-side. | |
# https://en.wikipedia.org/wiki/Aho?Corasick_algorithm | |
from collections import deque, namedtuple | |
from typing import Iterable, Dict, List, Callable, Optional, Set, Generator | |
FAILURE = () | |
OutputData = List[List[str]] | |
OutputFunction = Callable[[int], List[str]] | |
TransitionData = List[Dict[str, int]] | |
FailureTransitions = List[int] | |
FailureFunction = Callable[[int], int] | |
GotoFunction = Callable[[int, str], int] | |
MatchData = namedtuple('MatchData',['keyword', 'start_index', 'end_index']) | |
def build_goto_and_initial_output(keywords: Iterable[str]) -> (TransitionData, OutputData): | |
goto_states = [dict()] | |
tmp_o = dict() | |
for keyword in (k.lower() for k in keywords): | |
state = 0 | |
valid_transitions = goto_states[state] | |
for i, character in enumerate(keyword): | |
state = valid_transitions.get(character, FAILURE) | |
if state is FAILURE: | |
state = len(goto_states) | |
goto_states.append(dict()) | |
valid_transitions[character] = state | |
valid_transitions = goto_states[state] | |
tmp_o.setdefault(state, []).append(keyword) | |
return goto_states, [tmp_o.get(s, []) for s in range(len(goto_states))] | |
def build_failure_and_final_outputs(goto_states: TransitionData, | |
output_states: OutputData) -> (FailureTransitions, | |
OutputData): | |
failure_transitions = [0] * len(goto_states) | |
updated_output = output_states.copy() | |
queue = deque(goto_states[0].values()) | |
for initial_state in queue: | |
failure_transitions[initial_state] = 0 | |
while queue: | |
r = queue.popleft() | |
for a, s in goto_states[r].items(): | |
queue.append(s) | |
state = failure_transitions[r] | |
while goto_states[state].get(a, FAILURE) is FAILURE and state != 0: | |
state = failure_transitions[state] | |
failure_transitions[s] = goto_states[state].get(a, 0) | |
updated_output[s] = updated_output[s] + updated_output[failure_transitions[s]] | |
return failure_transitions, updated_output | |
def build_functions(keywords: List[str]): | |
goto_states, initial_outputs = build_goto_and_initial_output(keywords) | |
failure_states, final_outputs = build_failure_and_final_outputs(goto_states, | |
initial_outputs) | |
def goto(_state: int, a: str)->Optional[int]: | |
return goto_states[_state].get(a, None) | |
def output(_state: int)->List[str]: | |
return final_outputs[_state] | |
def failure(_state: int)->int: | |
return failure_states[_state] | |
return goto, output, failure | |
def generate_matches(string: str, | |
goto_function: GotoFunction, | |
failure_function: FailureFunction, | |
output_function: OutputFunction) -> Generator[MatchData, None, None]: | |
current_state = 0 | |
for i, character in enumerate(string.lower()): | |
while goto_function(current_state, character) is None: | |
if current_state == 0: | |
break | |
current_state = failure_function(current_state) | |
current_state = goto_function(current_state, character) | |
if current_state is None: | |
current_state = 0 | |
else: | |
for keyword in output_function(current_state): | |
yield MatchData(keyword, i - len(keyword), i) | |
def find_matches(keywords: List[str], line: str) -> Iterable[MatchData]: | |
goto, output, failure = build_functions(keywords) | |
return generate_matches(line, goto, failure, output) | |
def find_matched_words(keywords: List[str], line: str) -> Set[str]: | |
goto, output, failure = build_functions(keywords) | |
return set([m.keyword for m in generate_matches(line, goto, failure, output)]) | |
if __name__ == '__main__': | |
keywords = ['met', 'uis','duis','lab', 'ng'] | |
input = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.' | |
keywords, starts, stops = tuple(zip(*list(find_matches(keywords, input)))) | |
print(input) | |
match_string = [' '] * len(input) | |
for beginning, end in zip(starts, stops): | |
match_string[beginning+1:end] = ['^'] * (end - beginning) | |
print(''.join(match_string)) | |
print(find_matched_words(keywords, input)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment