%%% File : ahocorasik.erl %%% Author : Fyodor Ustinov %%% Descrip.: Multiple search based on Aho–Corasick string matching algorithm %%% %%% License : GPL %%% %%% Usage: %%% ahocorasik:match(["pat1", "pat2"], "pat1 pat2 pat1") %%% or %%% Tree = ahocorasik:tree(["pat1", "pat2"]) %%% ahocorasik:match_tree(Tree, "pat1 pat2 pat1") %%% %%% functions 'match' and 'match_tree' returns a list of matching patterns %%% and their position in the text: %%% [{"pat1",[1,11]},{"pat2",[6]}] %%% -module(ahocorasik). -export([match/2, match_tree/2, tree/1]). -define(ROOT, {[], [], []}). -type apex() :: {list(), list(), list()}. -record(s,{ apex = ?ROOT :: apex(), tree = [] :: list(apex()), counter = dict:new() :: dict(), pos = 1 :: integer() } ). tree(Pat) -> build_fail(build_tree(Pat, []), 1) . build_tree([], Tree) -> [?ROOT|Tree]; build_tree([H|T], Tree) -> build_tree(T, parse_word(H, Tree)) . parse_word(Word, Tree) -> parse_word(Word, Tree, []) . parse_word([], Tree, Apex) when length(Apex) =:= 0 -> Tree; parse_word([], Tree, Apex) -> Out = lists:reverse(Apex), add_out(Apex, Out, Tree) ; parse_word([H|T], Tree, Apex) -> CApex = [H|Apex], case lists:keymember(CApex, 1, Tree) of true -> parse_word(T, Tree, CApex); false -> parse_word(T, add_apex(CApex, Tree), CApex) end . add_apex(CApex, Tree) -> [{CApex, [], []}| Tree] . add_out(Apex, Out, Tree) -> {Apex, COut, Fail} = lists:keyfind(Apex, 1, Tree), case lists:member(Out, COut) of false -> lists:keyreplace(Apex, 1, Tree, {Apex, lists:usort([Out | COut]), Fail}); true -> Tree end . build_fail(Tree, Level) -> Apexes = [X || {X, _, _} <- Tree, length(X) =:= Level], case Apexes of [] -> Tree; _ -> build_fail(fail_apexes(Apexes, Tree), Level + 1) end . fail_apexes([], Tree) -> Tree; fail_apexes([Apex|Apexes], Tree) -> fail_apexes(Apexes, fail_one(Apex, lists:reverse(Apex), Tree)) . fail_one([_Apex|[]], _, Tree) -> Tree; fail_one(_, [], Tree) -> Tree; fail_one(Apex, [_|SApex], Tree) -> NApex = lists:reverse(SApex), case lists:keymember(NApex, 1, Tree) of true -> add_fail(Apex, NApex, Tree); false -> fail_one(Apex, SApex, Tree) end . add_fail(Apex, NApex, Tree) -> {Apex, Out, _} = lists:keyfind(Apex, 1, Tree), {NApex, COut, _} = lists:keyfind(NApex, 1, Tree), lists:keyreplace(Apex, 1, Tree, {Apex, lists:umerge(COut, Out), NApex}) . match_tree(Tree, Text) -> do_match(#s{tree = Tree}, Text). match(Pat, Text) -> do_match(#s{tree = tree(Pat)}, Text). do_match(S, []) -> dict:to_list(S#s.counter); do_match(S, [H|T]) -> do_match((do_apex(S, H))#s{pos = S#s.pos + 1}, T). do_apex(S, H) -> Apex = [H | element(1, S#s.apex)], case lists:keyfind(Apex, 1, S#s.tree) of {Apex, Out, _Fail} = NApex -> (inc_out(S, Out))#s{apex = NApex}; false -> case length(Apex) of 1 -> S#s{apex = ?ROOT}; _ -> do_apex(S#s{apex = lists:keyfind(element(3, S#s.apex), 1, S#s.tree)}, H) end end . inc_out(S, []) -> S; inc_out(S, [H|T]) -> inc_out(S#s{counter = dict:append(H, S#s.pos - length(H) + 1, S#s.counter)}, T) .