Skip to content

Instantly share code, notes, and snippets.

@edwinyzh
Forked from madhurjain/AhoCorasick.pas
Created December 9, 2020 14:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edwinyzh/e0baf5300ec184a9d095b09966b73815 to your computer and use it in GitHub Desktop.
Save edwinyzh/e0baf5300ec184a9d095b09966b73815 to your computer and use it in GitHub Desktop.
AhoCorasick Implementation in Delphi / Pascal
unit AhoCorasick;
interface
uses Classes, Generics.Collections;
type
ptrNode = ^TNode;
TNode = record
id: Cardinal; // Node Id for debugging
alpha: AnsiChar; // Char at Node
isFinal: Boolean; // 0: No; 1: Yes, it is a final node
nextNodes: TList; // Transition Nodes
failureNode: ptrNode; // Failure Node
depth: Cardinal; // Depth of Node
result: AnsiString; // Keyword at Node
end;
type TAhoCorasick = class
private
root: TNode;
public
constructor Create;
destructor Destroy; override;
procedure AddKeyword(keyword: AnsiString);
procedure SetFailureNodes;
procedure SearchKeywords(content: AnsiString);
procedure PrintTrie;
end;
implementation
uses Windows, System.SysUtils;
procedure TAhoCorasick.AddKeyword(keyword: AnsiString);
var
ch: AnsiChar;
currentNode, newNode, nextNode: ptrNode;
begin
currentNode := @root;
// Iterate over each character in the keyword
for ch in keyword do
begin
newNode := nil;
// Check if the char series already exists in the node chain
for nextNode in currentNode^.nextNodes do
begin
if ch = nextNode.alpha then
begin
newNode := nextNode;
break;
end;
end;
// If char doesn't exist, add it as a new node
if newNode = nil then
begin
New(newNode);
newNode^.alpha := ch;
newNode^.depth := currentNode^.depth + 1;
newNode^.nextNodes := TList.Create;
newNode^.failureNode := nil;
newNode^.isFinal := False;
currentNode^.nextNodes.Add(newNode);
end;
// Set current node to next
currentNode := newNode;
end;
// Done adding keyword. Set the last node of keyword as final.
currentNode^.isFinal := True;
// Add the keyword to node
currentNode^.result := keyword;
end;
procedure TAhoCorasick.SetFailureNodes;
var
Queue: TQueue<ptrNode>;
parentNode, currentNode, stateNode, tmpNode: ptrNode;
matchFound: Boolean;
begin
Queue := TQueue<ptrNode>.Create();
// Set failureNode of all node with depth = 1 to root
// Also add them to queue
for tmpNode in root.nextNodes do
begin
tmpNode^.failureNode := @root;
Queue.Enqueue(tmpNode);
end;
while Queue.Count > 0 do
begin
parentNode := Queue.Dequeue();
for currentNode in parentNode^.nextNodes do
begin
// Enqueue node to be proecessed
Queue.Enqueue(currentNode);
// Calculate failure function for node
// from failure function of parent node
stateNode := parentNode^.failureNode;
matchFound := False;
// Iterate to top until a match for goto function is found
while (matchFound = False) do
begin
// Check if char exist in stateNode transitions
for tmpNode in stateNode^.nextNodes do
begin
if tmpNode^.alpha = currentNode^.alpha then
begin
matchFound := True;
break;
end;
end;
if stateNode = @root then
begin
break;
end;
// Move to depth - 1
if not matchFound then
begin
stateNode := stateNode^.failureNode;
end;
end; // end while
// Matching state found in trie
if matchFound then
begin
currentNode^.failureNode := tmpNode;
end
else
begin
currentNode^.failureNode := @root;
end;
end;
end;
Queue.Free;
end;
function FindNextNode(currentNode: ptrNode; alpha: AnsiChar): ptrNode;
var
tmpNode: ptrNode;
begin
Result := nil;
// Search for node matching the char
for tmpNode in currentNode^.nextNodes do
begin
// If found, return;
if alpha = tmpNode^.alpha then
begin
Result := tmpNode;
break;
end;
end;
end;
procedure TAhoCorasick.SearchKeywords(content: AnsiString);
var
c: AnsiChar;
currentNode, nextNode: ptrNode;
position: Cardinal;
begin
currentNode := @root;
position := 1;
// Iterate over the complete text
while position <= Length(content) do
begin
c := content[position];
nextNode := FindNextNode(currentNode, c);
// If not found, set current node to failure node
if nextNode = nil then
begin
// Increment only if node is root
if currentNode = @root then
begin
Inc(position);
end
else
begin
currentNode := currentNode^.failureNode;
end;
end
else
begin
currentNode := nextNode;
Inc(position);
end;
if currentNode^.isFinal then
begin
OutputDebugStringA(PAnsiChar(currentNode^.result));
// Also add the output of its failure node, if it is final
if currentNode^.failureNode^.isFinal then
begin
currentNode := currentNode^.failureNode;
OutputDebugStringA(PAnsiChar(currentNode^.result));
end;
end;
end;
end;
procedure TAhoCorasick.PrintTrie;
var
Queue: TQueue<ptrNode>;
parentNode, currentNode: ptrNode;
dbgInfo: AnsiString;
begin
Queue := TQueue<ptrNode>.Create();
Queue.Enqueue(@root);
while Queue.Count > 0 do
begin
parentNode := Queue.Dequeue();
for currentNode in parentNode^.nextNodes do
begin
// Enqueue next nodes
Queue.Enqueue(currentNode);
dbgInfo := currentNode^.alpha + ', ' + IntToStr(currentNode^.depth) + ' -> ' +
currentNode^.failureNode^.alpha + ', ' + IntToStr(currentNode^.failureNode^.depth);
OutputDebugStringA(PAnsiChar(dbgInfo));
end;
end;
end;
constructor TAhoCorasick.Create;
begin
// Set root node
root.id := 0;
root.alpha := #0;
root.failureNode := @root;
root.nextNodes := TList.Create;
root.depth := 0;
root.isFinal := False;
end;
destructor TAhoCorasick.Destroy;
var
Stack: TStack<ptrNode>;
parentNode, currentNode: ptrNode;
begin
Stack := TStack<ptrNode>.Create();
Stack.Push(@root);
while Stack.Count > 0 do
begin
parentNode := Stack.Pop;
for currentNode in parentNode^.nextNodes do
begin
Stack.Push(currentNode);
end;
if Assigned(parentNode) and (parentNode <> @root) then
begin
parentNode^.nextNodes.Free;
Dispose(parentNode);
end;
end;
root.nextNodes.Free;
Stack.Free;
inherited Destroy;
end;
end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment