"""Simple test case for Aho-Corasick automata used for keyword search."""
import ahocorasick as ahc | |
# (keyword, category) pairs without surrounding spaces; every keyword shares
# category 1 in this demo.
keywords = [
    ('he', 1),
    ('she', 1),
    ('hers', 1),
    ('her', 1),
]

# Sample lines that are scanned for the keywords above.
text = [
    'he is here',
    'this is she',
    'this is hers ',
    'her bag is big',
]
def make_aho_automaton(keywords):
    """Build an Aho-Corasick automaton from ``(keyword, category)`` pairs.

    Every keyword is inserted into the trie with the payload
    ``(category, keyword)``, so a later match can report both the category
    and the exact keyword that was hit.

    :param keywords: iterable of ``(keyword, category)`` tuples.
    :return: the ``ahc.Automaton`` after ``make_automaton()`` has been called.
    """
    automaton = ahc.Automaton()
    for word, category in keywords:
        # Store the keyword itself in the payload next to its category.
        automaton.add_word(word, (category, word))
    # Turn the populated trie into a searchable automaton.
    automaton.make_automaton()
    return automaton
# Automaton over the unpadded keywords; used by the "no padding" demo below.
A = make_aho_automaton(keywords=keywords)
def find_keywords(line, A):
    """Return the keywords that automaton ``A`` reports for ``line``.

    ``A.iter(line)`` must yield ``(end_index, (category, keyword))`` pairs.
    Only the keyword part is kept, in the order the automaton yields the
    matches; repeated matches appear repeatedly.

    :param line: the string to scan.
    :param A: object exposing ``iter(line)`` as described above.
    :return: list of matched keywords (empty when nothing matches).
    """
    return [keyword for _end, (_category, keyword) in A.iter(line)]
# NOTE: each print call takes a single pre-formatted string so the script
# produces the same output under Python 2 (where the parentheses are plain
# grouping) and Python 3 (where print is a function).
print('------ no padding ---------')
new_text = []  # not read anywhere in this script; kept so module globals are unchanged
for line in text:
    print('%s : %s' % (line, find_keywords(line, A)))

print('------ with padding --------')
# Surrounding each keyword (and each line) with spaces makes the automaton
# match whole words only: ' he ' no longer fires inside ' here '.
keywords = [
    (' he ', 1),
    (' she ', 1),
    (' hers ', 1),
    (' her ', 1),
]
text = [
    ' he is here ',
    ' this is she ',
    ' this is hers ',
    ' her bag is big ',
]
A_spaces = make_aho_automaton(keywords)
for line in text:
    print('%s : %s' % (line, find_keywords(line, A_spaces)))

print('------ repalacing/removing found keywords ---------')
def find_keyword_locations(line, A, padding=1):
    """Flag the characters of ``line`` that belong to a matched keyword.

    ``A.iter(line)`` must yield ``(end_index, (category, keyword))`` pairs in
    which ``end_index`` is the index of the LAST character of the match.
    Keywords are assumed to carry ``padding`` extra characters (spaces in this
    script) on each side; those padding characters are not flagged, only the
    keyword's inner characters are.

    :param line: the string that was scanned.
    :param A: object exposing ``iter(line)`` as described above.
    :param padding: number of padding characters on each side of every
        keyword. The default of 1 matches keywords such as ``' he '``;
        pass 0 for unpadded keywords.
    :return: list of booleans, one per character of ``line``; ``True`` marks
        a character that lies inside a matched keyword.
    :raises ValueError: if ``padding`` is negative.
    """
    if padding < 0:
        raise ValueError("padding must be non-negative")

    line_indices = [False] * len(line)
    for end_index, (_category, keyw) in A.iter(line):
        # The match spans [end_index - len(keyw) + 1, end_index]; shrink it by
        # `padding` characters on both sides to leave the padding unflagged.
        start_index = end_index - len(keyw) + 1 + padding
        stop_index = end_index + 1 - padding  # exclusive upper bound
        for i in range(start_index, stop_index):
            line_indices[i] = True
    return line_indices
# Build two variants of every padded line: one with the matched keyword
# characters removed, and one with each of them replaced by '-'.
new_text_removed = []
new_text_replaced = []
for line in text:
    # One boolean per character; True marks a character inside a keyword.
    line_indices = find_keyword_locations(line, A_spaces)
    new_text_removed.append(
        "".join(ch for ch, hit in zip(line, line_indices) if not hit))
    new_text_replaced.append(
        "".join('-' if hit else ch for ch, hit in zip(line, line_indices)))

# Single-argument print calls behave identically on Python 2 and Python 3.
print(text)
print(new_text_removed)
print(new_text_replaced)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment