Skip to content

Instantly share code, notes, and snippets.

@Validark
Created July 18, 2021 12:31
Show Gist options
  • Save Validark/d493cfd1b3425c2e3073f5ccd08fbeb9 to your computer and use it in GitHub Desktop.
Save Validark/d493cfd1b3425c2e3073f5ccd08fbeb9 to your computer and use it in GitHub Desktop.
A clean implementation of the aho-corasick algorithm taking full advantage of Lua's __index metamethod.
-- We can re-use metatables where possible
local lookupCache = {
__index = function(self, i)
local v = { __index = i }
self[i] = v
return v
end
}
local function use_aho_corasick(automaton, str)
local found = {}
local state = automaton
for i = 1, string.len(str) do
state = state[string.lower(string.sub(str, i, i))] or automaton
for _, capture in ipairs(state.captures) do
table.insert(found, { i - capture + 1, i })
end
end
return found
end
local automaton_mt = { __call = use_aho_corasick }
-- given a dictionary Array<string>
-- output a DFA with metatable magic
local function aho_corasick(dictionary)
-- captures: Array<numbers> representing the length of matched strings
local automaton = { captures = {} }
-- construct a basic Trie for the dictionary
for _, word in ipairs(dictionary) do
local cur = automaton
for char in string.gmatch(word, ".") do
local next = cur[char]
if next == nil then
next = {}
cur[char] = next
end
cur = next
end
cur.captures = cur.captures or {}
table.insert(cur.captures, string.len(word))
end
-- Link each node to the longest suffix in its path
-- Do a BFS, linking each level with its runs
-- (BFS allows us to use our links as we go!)
local pointers = setmetatable({}, lookupCache)
local queue = { { {}, automaton } }
for _, data in ipairs(queue) do
for char, node in pairs(data[2]) do
if char ~= "captures" then
local state = data[1][char] or automaton
if #state.captures > 0 then
if node.captures == nil then node.captures = {} end
table.move(state.captures, 1, #state.captures, #node.captures + 1, node.captures)
end
table.insert(queue, { state, setmetatable(node, pointers[state]) })
end
end
end
return setmetatable(automaton, automaton_mt)
end
do -- a little demo
local automata = aho_corasick { "try", "to", "find", "all", "these", "words" }
local input = "Let's see if you can find any matching words in this sentence"
local captures = automata(input)
for _, capture in ipairs(captures) do
print(capture[1], string.sub(input, capture[1], capture[2]))
end
end
return aho_corasick
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment