# IevaZarina/aho-corasick-automata.py

## aho-corasick-automata.py
import ahocorasick as ahc

# Bare keywords (no surrounding spaces), each given as (keyword, category).
keywords = [('he', 1), ('she', 1), ('hers', 1), ('her', 1)]

# Sample lines to scan; only the third one ends with a space.
text = [
    'he is here',
    'this is she',
    'this is hers ',
    'her bag is big',
]


def make_aho_automaton(keywords):
    """Build an Aho-Corasick automaton from ``(keyword, category)`` pairs.

    Each keyword is stored with the payload ``(category, keyword)``; callers
    in this file unpack that payload from the items yielded by ``iter``.
    """
    automaton = ahc.Automaton()
    for word, category in keywords:
        # Register the keyword in the trie together with its payload.
        automaton.add_word(word, (category, word))
    # Turn the filled trie into a searchable automaton.
    automaton.make_automaton()
    return automaton


# Automaton over the bare (unpadded) keywords defined above.
A = make_aho_automaton(keywords)


def find_keywords(line, A):
    """Return the keywords that ``A`` reports for ``line``, in iteration order.

    ``A.iter(line)`` must yield ``(end_index, (category, keyword))`` items.
    Only the keyword is kept, so a keyword matched several times appears
    several times in the result.
    """
    return [word for _end, (_cat, word) in A.iter(line)]


# Every print below is a call with ONE pre-formatted argument: that form
# produces the same output whether print is a statement (Python 2) or a
# function (Python 3).
print('------ no padding ---------')

new_text = []  # not used by the code in this section; kept as a module-level name
for line in text:
    # With bare keywords every substring occurrence is reported.
    print('%s : %s' % (line, find_keywords(line, A)))

print('------ with padding --------')

# Space-padded keywords only match when a space sits on both sides, so the
# sample lines are given a leading and a trailing space as well.
keywords = [
    (' he ', 1),
    (' she ', 1),
    (' hers ', 1),
    (' her ', 1)
]

text = [
    ' he is here ',
    ' this is she ',
    ' this is hers ',
    ' her bag is big '
]

A_spaces = make_aho_automaton(keywords)

for line in text:
    print('%s : %s' % (line, find_keywords(line, A_spaces)))

print('------ repalacing/removing found keywords ---------')


def find_keyword_locations(line, A, padding=1):
    """Flag the characters of ``line`` that belong to a matched keyword.

    ``A.iter(line)`` must yield ``(end_index, (category, keyword))`` items,
    where ``end_index`` is the position in ``line`` of the keyword's last
    character.

    ``padding`` is the number of characters at each end of a stored keyword
    that only delimit it and must stay unflagged. The default of 1 fits
    keywords stored with one leading and one trailing space; pass 0 for
    bare keywords.

    Returns a list of booleans, one per character of ``line``.
    Raises ValueError if ``padding`` is negative.
    """
    if padding < 0:
        raise ValueError('padding must be non-negative')
    line_indices = [False] * len(line)
    for end_index, (_cat, keyw) in A.iter(line):
        # First flagged position: keyword start, moved past the leading padding.
        first = end_index - len(keyw) + 1 + padding
        # Last flagged position: keyword end, pulled back before the trailing padding.
        last = end_index - padding
        for i in range(first, last + 1):
            line_indices[i] = True
    return line_indices


new_text_removed = []
new_text_replaced = []
for line in text:
    line_indices = find_keyword_locations(line, A_spaces)
    # Variant 1: drop every flagged character.
    new_text_removed.append(''.join(ch for ch, hit in zip(line, line_indices) if not hit))
    # Variant 2: overwrite every flagged character with '-' (length is preserved).
    new_text_replaced.append(''.join('-' if hit else ch for ch, hit in zip(line, line_indices)))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(text)
print(new_text_removed)
print(new_text_replaced)
	import ahocorasick as ahc

	# Bare keywords (no surrounding spaces), each given as (keyword, category).
	keywords = [('he', 1), ('she', 1), ('hers', 1), ('her', 1)]

	# Sample lines to scan; only the third one ends with a space.
	text = [
		'he is here',
		'this is she',
		'this is hers ',
		'her bag is big',
	]


	def make_aho_automaton(keywords):
		"""Build an Aho-Corasick automaton from ``(keyword, category)`` pairs.

		Each keyword is stored with the payload ``(category, keyword)``.
		"""
		A = ahc.Automaton()  # initialize
		for (key, cat) in keywords:
			# Add keys and categories to the trie structure.
			A.add_word(key, (cat, key))
		A.make_automaton()  # generate automaton
		return A


	# Automaton over the bare (unpadded) keywords defined above.
	A = make_aho_automaton(keywords)


	def find_keywords(line, A):
		"""Return the keywords that ``A`` reports for ``line``, in iteration order.

		``A.iter(line)`` must yield ``(end_index, (category, keyword))`` items;
		only the keyword is kept.
		"""
		found_keywords = []
		for _end_index, (_cat, keyw) in A.iter(line):
			found_keywords.append(keyw)
		return found_keywords


	# Every print below is a call with ONE pre-formatted argument: that form
	# produces the same output whether print is a statement (Python 2) or a
	# function (Python 3).
	print('------ no padding ---------')

	new_text = []  # not used by the code in this section; kept as a name
	for line in text:
		# With bare keywords every substring occurrence is reported.
		print('%s : %s' % (line, find_keywords(line, A)))

	print('------ with padding --------')

	# Space-padded keywords only match when a space sits on both sides, so the
	# sample lines are given a leading and a trailing space as well.
	keywords = [
		(' he ', 1),
		(' she ', 1),
		(' hers ', 1),
		(' her ', 1)
	]

	text = [
		' he is here ',
		' this is she ',
		' this is hers ',
		' her bag is big '
	]

	A_spaces = make_aho_automaton(keywords)

	for line in text:
		print('%s : %s' % (line, find_keywords(line, A_spaces)))

	print('------ repalacing/removing found keywords ---------')


	def find_keyword_locations(line, A, padding=1):
		"""Flag the characters of ``line`` that belong to a matched keyword.

		``A.iter(line)`` must yield ``(end_index, (category, keyword))`` items,
		where ``end_index`` is the position in ``line`` of the keyword's last
		character.

		``padding`` is the number of characters at each end of a stored keyword
		that only delimit it and must stay unflagged. The default of 1 fits
		keywords stored with one leading and one trailing space; pass 0 for
		bare keywords.

		Returns a list of booleans, one per character of ``line``.
		Raises ValueError if ``padding`` is negative.
		"""
		if padding < 0:
			raise ValueError('padding must be non-negative')
		line_indices = [False] * len(line)
		for end_index, (_cat, keyw) in A.iter(line):
			# First flagged position: keyword start, moved past the leading padding.
			first = end_index - len(keyw) + 1 + padding
			# Last flagged position: keyword end, pulled back before the trailing padding.
			last = end_index - padding
			for i in range(first, last + 1):
				line_indices[i] = True
		return line_indices


	new_text_removed = []
	new_text_replaced = []
	for line in text:
		line_indices = find_keyword_locations(line, A_spaces)
		# Variant 1: drop every flagged character.
		new_text_removed.append(''.join(ch for ch, hit in zip(line, line_indices) if not hit))
		# Variant 2: overwrite every flagged character with '-' (length is preserved).
		new_text_replaced.append(''.join('-' if hit else ch for ch, hit in zip(line, line_indices)))

	# Single-argument print(...) behaves the same on Python 2 and Python 3.
	print(text)
	print(new_text_removed)
	print(new_text_replaced)