Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Simple test case for an Aho-Corasick automaton used for keyword search.
import ahocorasick as ahc
# Unpadded keywords, each paired with a category id (every entry uses 1).
keywords = [
    ("he", 1),
    ("she", 1),
    ("hers", 1),
    ("her", 1),
]

# Sample lines to scan; the third entry keeps its trailing space.
text = [
    "he is here",
    "this is she",
    "this is hers ",
    "her bag is big",
]
def make_aho_automaton(keywords):
    """Build an automaton from an iterable of ``(keyword, category)`` pairs.

    Every keyword is registered with the payload ``(category, keyword)``;
    ``make_automaton()`` is then called and the automaton is returned.
    """
    automaton = ahc.Automaton()
    for word, category in keywords:
        payload = (category, word)
        automaton.add_word(word, payload)
    # Convert the filled trie into the automaton before handing it back.
    automaton.make_automaton()
    return automaton
# Automaton built from the unpadded keyword list assigned to `keywords` above.
A = make_aho_automaton(keywords)
def find_keywords(line, A):
    """Return the keywords that automaton ``A`` reports for ``line``.

    ``A.iter(line)`` is expected to yield ``(end_index, (category, keyword))``
    tuples; only the keyword of each hit is kept, in the order yielded.
    """
    return [keyw for _end, (_cat, keyw) in A.iter(line)]
# Demo 1: scan the raw lines with the unpadded keywords.
# Every print below is a single-argument call, which is valid on Python 3 and
# produces the same output as the former print statements on Python 2.
print('------ no padding ---------')
new_text = []  # not read anywhere in the visible script; kept so the name still exists
for line in text:
    print('%s : %s' % (line, find_keywords(line, A)))
# Demo 2: keywords and lines are both wrapped in single spaces.
# Single-argument print call: valid on Python 3, same output on Python 2.
print('------ with padding --------')

# Rebinds `keywords`: each keyword now carries one leading and one trailing space.
keywords = [
    (' he ', 1),
    (' she ', 1),
    (' hers ', 1),
    (' her ', 1),
]

# Rebinds `text`: every line is wrapped in one leading and one trailing space.
text = [
    ' he is here ',
    ' this is she ',
    ' this is hers ',
    ' her bag is big ',
]
# Automaton built from the current (space-padded) value of `keywords`.
A_spaces = make_aho_automaton(keywords)
for line in text:
    # Single-argument print call: valid on Python 3, same output on Python 2.
    print('%s : %s' % (line, find_keywords(line, A_spaces)))
# Section header; single-argument print call works on both Python 2 and 3.
print('------ repalacing/removing found keywords ---------')
def find_keyword_locations(line, A, padding=1):
    """Flag which character positions of ``line`` belong to a matched keyword.

    Args:
        line: String to scan.
        A: Automaton whose ``iter(line)`` yields
            ``(end_index, (category, keyword))`` tuples, where ``end_index``
            is the position of the last character of the match.
        padding: Number of characters at each end of every stored keyword
            that are padding and must not be flagged. The default of 1 fits
            keywords stored with one surrounding space (e.g. ``' he '``);
            pass 0 for unpadded keywords.

    Returns:
        A list of booleans, one per character of ``line``. ``True`` marks a
        character covered by a keyword, padding excluded.
    """
    line_indices = [False] * len(line)
    for end_index, (_cat, keyw) in A.iter(line):
        # First flagged position: start of the match plus the leading padding.
        start_index = end_index - len(keyw) + 1 + padding
        # One past the last flagged position: trailing padding is left out.
        stop_index = end_index + 1 - padding
        for i in range(start_index, stop_index):
            line_indices[i] = True
    return line_indices
# Demo 3: for every padded line build one copy with the flagged keyword
# characters dropped and one copy with each of them replaced by '-'.
new_text_removed = []
new_text_replaced = []
for line in text:
    line_indices = find_keyword_locations(line, A_spaces)
    # Pair each character with its flag instead of indexing a list copy.
    new_text_removed.append(
        "".join(ch for ch, hit in zip(line, line_indices) if not hit))
    new_text_replaced.append(
        "".join('-' if hit else ch for ch, hit in zip(line, line_indices)))
# Single-argument print calls: valid on Python 3, same output on Python 2.
print(text)
print(new_text_removed)
print(new_text_replaced)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment