Skip to content

Instantly share code, notes, and snippets.

@utgarda
Forked from ufm/ahocorasik.erl
Created November 8, 2010 12:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save utgarda/667655 to your computer and use it in GitHub Desktop.
Save utgarda/667655 to your computer and use it in GitHub Desktop.
%%% File : ahocorasik.erl
%%% Author : Fyodor Ustinov <ufm@ufm.su>
%%% Descrip.: Multiple search based on Aho-Corasick string matching algorithm
%%%
%%% License : GPL
%%%
%%% Usage:
%%% ahocorasik:match(["pat1", "pat2"], "pat1 pat2 pat1")
%%% or
%%% Tree = ahocorasick:tree(["pat1", "pat2"])
%%% ahocorasick:match_tree(Tree, "pat1 pat2 pat1")
%%%
%%% functions 'match' and 'match_tree' return lists of matching patterns
%%% and their positions in the text:
%%% [{"pat1",[1,11]},{"pat2",[6]}]
%%%
%% TODO: mind the license. Either publish the changes,
%% (don't integrate this module much with the rest of the code then)
%% or re-implement, if this one proves unusable
-module(ahocorasick).
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif.
-export([match/2, match_tree/2, tree/1]).
-define(ROOT, {[], [], []}).
-define(FAILURE(State),element(3, State#state.apex)).
% label out set failure
-type apex() :: {list(), list(), list()}.
-record(state,{
apex = ?ROOT :: apex(),
tree = [] :: list(apex()),
counter = dict:new() :: dict(),
pos = 1 :: integer()
}
).
%% Building a trie from a pattern dictionary
tree(Patterns) ->
build_fail(build_tree(Patterns, []), 1)
.
build_tree([], Tree) -> [?ROOT|lists:reverse(Tree)];
build_tree([H|T], Tree) ->
build_tree(T, parse_word(H, Tree))
.
parse_word(Word, Tree) ->
parse_word(Word, Tree, [])
.
%TODO Note: here Apex is not of type apex() defined above. Deceiving, fix it.
parse_word([], Tree, ApexLabel) when length(ApexLabel) =:= 0 -> Tree;
parse_word([], Tree, ApexLabel) ->
Out = lists:reverse(ApexLabel),
add_out(ApexLabel, Out, Tree);
parse_word([H|T], Tree, ApexLabel) ->
CApex = [H|ApexLabel],
case lists:keymember(CApex, 1, Tree) of
true ->
parse_word(T, Tree, CApex);
false ->
parse_word(T, add_apex(CApex, Tree), CApex)
end.
add_apex(CApex, Tree) ->
[{CApex, [], []}| Tree].
add_out(Apex, Out, Tree) ->
{Apex, COut, Fail} = lists:keyfind(Apex, 1, Tree),
case lists:member(Out, COut) of
false ->
lists:keyreplace(Apex, 1, Tree, {Apex, lists:usort([Out | COut]), Fail});
true ->
Tree
end.
build_fail(Tree, Level) ->
Apexes = [X || {X, _, _} <- Tree, length(X) =:= Level],
case Apexes of
[] ->
Tree;
_ ->
build_fail(fail_apexes(Apexes, Tree), Level + 1)
end.
fail_apexes([], Tree) -> Tree;
fail_apexes([Apex|Apexes], Tree) ->
fail_apexes(Apexes, fail_one(Apex, lists:reverse(Apex), Tree)).
fail_one([_Apex|[]], _, Tree) -> Tree; % No failure functions for 1-character labels
fail_one(_, [], Tree) -> Tree;
fail_one(Apex, [_|SApex], Tree) ->
NApex = lists:reverse(SApex), % Is the point of [_|SApex] and lists:reverse(Apex) to just get a sublist? Is it faster or what?
case lists:keymember(NApex, 1, Tree) of
true ->
add_fail(Apex, NApex, Tree);
false ->
fail_one(Apex, SApex, Tree)
end.
add_fail(Apex, NApex, Tree) ->
{Apex, Out, _} = lists:keyfind(Apex, 1, Tree),
{NApex, COut, _} = lists:keyfind(NApex, 1, Tree),
lists:keyreplace(Apex, 1, Tree, {Apex, lists:umerge(COut, Out), NApex}).
%Maybe using ordsets:union/2 would be more readable than just lists:umerge/2
%% Searching for patterns with a pre-built trie
match_tree(Tree, Text) -> do_match(#state{tree = Tree}, Text).
match(Patterns, Text) -> do_match(#state{tree = tree(Patterns)}, Text).
do_match(S, []) -> dict:to_list(S#state.counter);
do_match(S, [H|T]) ->
S1 = do_apex(S, H),
S2 = S1#state{pos = S1#state.pos + 1},
do_match(S2, T).
do_apex(S, H) ->
Apex = [H | element(1, S#state.apex)], % adding a character to current apex label
% performing keyfind on all nodes, not just children of the current apex,
% which may be not so good for big automatons
% TODO: consider keeping links to subnodes
case lists:keyfind(Apex, 1, S#state.tree) of
{Apex, Out, _Fail} = NApex ->
(inc_out(S, Out))#state{apex = NApex};
false ->
case length(Apex) of
1 ->
S#state{apex = ?ROOT};
_ -> % Getting failure function ↓ isn't very readable
NextApex = lists:keyfind(?FAILURE(S),1, S#state.tree),
S1 = S#state{apex = NextApex},
do_apex(S1, H)
% TODO: change type apex() to record?
end
end.
inc_out(S, []) -> S;
inc_out(S, [H|T]) ->
inc_out(S#state{counter = dict:append(H, S#state.pos - length(H) + 1, S#state.counter)}, T).
%%%%%%%%%%%%%%%%%%%% Tests %%%%%%%%%%%%%%%%%%%%%%%%%
-ifdef(TEST).
tree_test() ->
?assertEqual(tree(["1"]), [{[],[],[]},{"1",["1"],[]}]),
T = tree(["1","2"]),
?assertEqual(T, [{[],[],[]},{"1",["1"],[]},{"2",["2"],[]}]),
R = match(["1","2"],"123456789012345678901234567890"),
?assertEqual(R, [{"2",[2,12,22]},{"1",[1,11,21]}]),
?assertEqual(match_tree(T,"123456789012345678901234567890"), R),
?assertEqual(match(["1","12","2","23","901"],"123456789012345678901234567890"), [
{"23",[2,12,22]},
{"2",[2,12,22]},
{"901",[9,19]},
{"1",[1,11,21]},
{"12",[1,11,21]}]
).
-endif.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment