-
-
Save utgarda/667655 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%% File : ahocorasik.erl | |
%%% Author : Fyodor Ustinov <ufm@ufm.su> | |
%%% Descrip.: Multiple search based on Aho-Corasick string matching algorithm | |
%%% | |
%%% License : GPL | |
%%% | |
%%% Usage: | |
%%% ahocorasik:match(["pat1", "pat2"], "pat1 pat2 pat1") | |
%%% or | |
%%% Tree = ahocorasick:tree(["pat1", "pat2"]) | |
%%% ahocorasick:match_tree(Tree, "pat1 pat2 pat1") | |
%%% | |
%%% functions 'match' and 'match_tree' return lists of matching patterns | |
%%% and their positions in the text: | |
%%% [{"pat1",[1,11]},{"pat2",[6]}] | |
%%% | |
%% TODO: mind the license. Either publish the changes, | |
%% (don't integrate this module much with the rest of the code then) | |
%% or re-implement, if this one proves unusable | |
-module(ahocorasick). | |
-ifdef(TEST). | |
-include_lib("eunit/include/eunit.hrl"). | |
-endif. | |
-export([match/2, match_tree/2, tree/1]). | |
-define(ROOT, {[], [], []}). | |
-define(FAILURE(State),element(3, State#state.apex)). | |
% label out set failure | |
-type apex() :: {list(), list(), list()}. | |
-record(state,{ | |
apex = ?ROOT :: apex(), | |
tree = [] :: list(apex()), | |
counter = dict:new() :: dict(), | |
pos = 1 :: integer() | |
} | |
). | |
%% Building a trie from a pattern dictionary | |
tree(Patterns) -> | |
build_fail(build_tree(Patterns, []), 1) | |
. | |
build_tree([], Tree) -> [?ROOT|lists:reverse(Tree)]; | |
build_tree([H|T], Tree) -> | |
build_tree(T, parse_word(H, Tree)) | |
. | |
parse_word(Word, Tree) -> | |
parse_word(Word, Tree, []) | |
. | |
%TODO Note: here Apex is not of type apex() defined above. Deceiving, fix it. | |
parse_word([], Tree, ApexLabel) when length(ApexLabel) =:= 0 -> Tree; | |
parse_word([], Tree, ApexLabel) -> | |
Out = lists:reverse(ApexLabel), | |
add_out(ApexLabel, Out, Tree); | |
parse_word([H|T], Tree, ApexLabel) -> | |
CApex = [H|ApexLabel], | |
case lists:keymember(CApex, 1, Tree) of | |
true -> | |
parse_word(T, Tree, CApex); | |
false -> | |
parse_word(T, add_apex(CApex, Tree), CApex) | |
end. | |
add_apex(CApex, Tree) -> | |
[{CApex, [], []}| Tree]. | |
add_out(Apex, Out, Tree) -> | |
{Apex, COut, Fail} = lists:keyfind(Apex, 1, Tree), | |
case lists:member(Out, COut) of | |
false -> | |
lists:keyreplace(Apex, 1, Tree, {Apex, lists:usort([Out | COut]), Fail}); | |
true -> | |
Tree | |
end. | |
build_fail(Tree, Level) -> | |
Apexes = [X || {X, _, _} <- Tree, length(X) =:= Level], | |
case Apexes of | |
[] -> | |
Tree; | |
_ -> | |
build_fail(fail_apexes(Apexes, Tree), Level + 1) | |
end. | |
fail_apexes([], Tree) -> Tree; | |
fail_apexes([Apex|Apexes], Tree) -> | |
fail_apexes(Apexes, fail_one(Apex, lists:reverse(Apex), Tree)). | |
fail_one([_Apex|[]], _, Tree) -> Tree; % No failure functions for 1-character labels | |
fail_one(_, [], Tree) -> Tree; | |
fail_one(Apex, [_|SApex], Tree) -> | |
NApex = lists:reverse(SApex), % Is the point of [_|SApex] and lists:reverse(Apex) to just get a sublist? Is it faster or what? | |
case lists:keymember(NApex, 1, Tree) of | |
true -> | |
add_fail(Apex, NApex, Tree); | |
false -> | |
fail_one(Apex, SApex, Tree) | |
end. | |
add_fail(Apex, NApex, Tree) -> | |
{Apex, Out, _} = lists:keyfind(Apex, 1, Tree), | |
{NApex, COut, _} = lists:keyfind(NApex, 1, Tree), | |
lists:keyreplace(Apex, 1, Tree, {Apex, lists:umerge(COut, Out), NApex}). | |
%Maybe using ordsets:union/2 would be more readable than just lists:umerge/2 | |
%% Searching for patterns with a pre-built trie | |
match_tree(Tree, Text) -> do_match(#state{tree = Tree}, Text). | |
match(Patterns, Text) -> do_match(#state{tree = tree(Patterns)}, Text). | |
do_match(S, []) -> dict:to_list(S#state.counter); | |
do_match(S, [H|T]) -> | |
S1 = do_apex(S, H), | |
S2 = S1#state{pos = S1#state.pos + 1}, | |
do_match(S2, T). | |
do_apex(S, H) -> | |
Apex = [H | element(1, S#state.apex)], % adding a character to current apex label | |
% performing keyfind on all nodes, not just children of the current apex, | |
% which may be not so good for big automatons | |
% TODO: consider keeping links to subnodes | |
case lists:keyfind(Apex, 1, S#state.tree) of | |
{Apex, Out, _Fail} = NApex -> | |
(inc_out(S, Out))#state{apex = NApex}; | |
false -> | |
case length(Apex) of | |
1 -> | |
S#state{apex = ?ROOT}; | |
_ -> % Getting failure function ↓ isn't very readable | |
NextApex = lists:keyfind(?FAILURE(S),1, S#state.tree), | |
S1 = S#state{apex = NextApex}, | |
do_apex(S1, H) | |
% TODO: change type apex() to record? | |
end | |
end. | |
inc_out(S, []) -> S; | |
inc_out(S, [H|T]) -> | |
inc_out(S#state{counter = dict:append(H, S#state.pos - length(H) + 1, S#state.counter)}, T). | |
%%%%%%%%%%%%%%%%%%%% Tests %%%%%%%%%%%%%%%%%%%%%%%%% | |
-ifdef(TEST). | |
tree_test() -> | |
?assertEqual(tree(["1"]), [{[],[],[]},{"1",["1"],[]}]), | |
T = tree(["1","2"]), | |
?assertEqual(T, [{[],[],[]},{"1",["1"],[]},{"2",["2"],[]}]), | |
R = match(["1","2"],"123456789012345678901234567890"), | |
?assertEqual(R, [{"2",[2,12,22]},{"1",[1,11,21]}]), | |
?assertEqual(match_tree(T,"123456789012345678901234567890"), R), | |
?assertEqual(match(["1","12","2","23","901"],"123456789012345678901234567890"), [ | |
{"23",[2,12,22]}, | |
{"2",[2,12,22]}, | |
{"901",[9,19]}, | |
{"1",[1,11,21]}, | |
{"12",[1,11,21]}] | |
). | |
-endif. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment