Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@atdt
Last active April 12, 2024 19:29
Show Gist options
  • Star 11 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save atdt/875e0dba6a15e3fa6018 to your computer and use it in GitHub Desktop.
Save atdt/875e0dba6a15e3fa6018 to your computer and use it in GitHub Desktop.
Aho-Corasick string matching in Python
# Python implementation of Aho-Corasick string matching
#
# Alfred V. Aho and Margaret J. Corasick, "Efficient string matching: an aid to
# bibliographic search", CACM, 18(6):333-340, June 1975.
#
# <http://xlinux.nist.gov/dads//HTML/ahoCorasick.html>
#
# Copyright (C) 2015 Ori Livneh <ori@wikimedia.org>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FAIL = -1
def aho_corasick(string, keywords):
transitions = {}
outputs = {}
fails = {}
new_state = 0
for keyword in keywords:
state = 0
for j, char in enumerate(keyword):
res = transitions.get((state, char), FAIL)
if res == FAIL:
break
state = res
for char in keyword[j:]:
new_state += 1
transitions[(state, char)] = new_state
state = new_state
outputs[state] = [keyword]
queue = []
for (from_state, char), to_state in transitions.items():
if from_state == 0 and to_state != 0:
queue.append(to_state)
fails[to_state] = 0
while queue:
r = queue.pop(0)
for (from_state, char), to_state in transitions.items():
if from_state == r:
queue.append(to_state)
state = fails[from_state]
while True:
res = transitions.get((state, char), state and FAIL)
if res != FAIL:
break
state = fails[state]
failure = transitions.get((state, char), state and FAIL)
fails[to_state] = failure
outputs.setdefault(to_state, []).extend(
outputs.get(failure, []))
state = 0
results = []
for i, char in enumerate(string):
while True:
res = transitions.get((state, char), state and FAIL)
if res != FAIL:
state = res
break
state = fails[state]
for match in outputs.get(state, ()):
pos = i - len(match) + 1
results.append((pos, match))
return results
@valentas-kurauskas
Copy link

aho_corasick("asdfasgergfergf", ["dfasger", "dfa", "gf"])
[(2, 'dfasger'), (9, 'gf'), (13, 'gf')]

why is (2,'df') missing here? Are there any assumptions on input order?

@duycuong87vn
Copy link

this implementation is not exactly.

@tejasvi
Copy link

tejasvi commented Oct 30, 2021

Can confirm @valentas-kurauskas

aho_corasick("asdfasgergfergf", ["dfasger", "dfa", "gf"])
[(2, 'dfasger'), (9, 'gf'), (13, 'gf')]

dfa is missing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment