Skip to content

Instantly share code, notes, and snippets.

@justinkamerman
Created March 22, 2012 15:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save justinkamerman/2158931 to your computer and use it in GitHub Desktop.
Save justinkamerman/2158931 to your computer and use it in GitHub Desktop.
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;
import util.*;
public class AhoIndexer extends DocumentTask
{
private static Logger log = Logger.getLogger (AhoIndexer.class.getName());
private StateMachine __sm;
public AhoIndexer (StateMachine sm, Document document, List<Match> matches)
{
super (document, matches);
__sm = sm;
}
/**
* Index a document and return match result
*/
public Match work (Document document)
{
log.fine ("Processing document " + document.getName());
Match match = new Match (document);
ExecutionContext ctx = new ExecutionContext (__sm);
try
{
InputStreamReader isr = new InputStreamReader (new FileInputStream (document.getFile()), "US-ASCII");
BufferedReader br = new BufferedReader (isr);
int b;
while ((b = br.read()) != -1)
{
char a = (char) Character.toLowerCase(b);
Set<String> output = ctx.goTo (a);
log.finest (String.format("--%c--> %d", a, ctx.getState().getId()));
// If there is any output from the target state, add it to the document match
if ( output != null )
{
match.addOccurrence (output, ctx.getPosition());
}
}
br.close();
}
catch (FileNotFoundException ex)
{
log.severe ("File not found: " + document.getName() + ": " + ex.toString());
}
catch (IOException ex)
{
log.severe ("IO Exception reading document: " + ex.toString());
}
return match;
}
}
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;
import util.*;
public class AhoSearcher extends DocumentTask
{
private static Logger log = Logger.getLogger (AhoSearcher.class.getName());
private StateMachine __sm;
private Set<String> __searchWords;
public AhoSearcher (StateMachine sm, Document document, List<Match> matches, Set<String> searchWords)
{
super (document, matches);
__sm = sm;
__searchWords = searchWords;
}
/**
* Returns true if the given document contains at least one
* occurrence of each keyword.
*/
public Match work (Document document)
{
log.fine ("Processing document " + document.getName());
Match match = new Match (document);
ExecutionContext ctx = new ExecutionContext (__sm);
try
{
InputStreamReader isr = new InputStreamReader (new FileInputStream (document.getFile()), "US-ASCII");
BufferedReader br = new BufferedReader (isr);
int b;
while ((b = br.read()) != -1)
{
char a = (char) Character.toLowerCase(b);
Set<String> output = ctx.goTo (a);
log.finest (String.format("--%c--> %d", a, ctx.getState().getId()));
// If there is any output from the target state, check
// it against the search word list
if ( output != null )
{
for (String matchword : output)
{
if (__searchWords.contains (matchword))
{
match.addOccurrence (matchword, ctx.getPosition());
if ( match.size() == __searchWords.size() ) return match;
}
}
// Check if we've found each search word
if (__searchWords.size() == 0) return match;
}
}
br.close ();
}
catch (FileNotFoundException ex)
{
log.severe ("File not found: " + document.getName() + ": " + ex.toString());
}
catch (IOException ex)
{
log.severe ("IO Exception reading document: " + ex.toString());
}
return null;
}
}
import java.io.DataInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;
import util.*;
/**
* This class externalizes an execution path through an Aho state
* machine so that a single machine can be used simultaneously by
* multiple threads.
*/
public class ExecutionContext
{
private static Logger log = Logger.getLogger (ExecutionContext.class.getName());
private StateMachine __stateMachine;
private State __state;
private int __position;
/**
* Create execution context which can be used by a thread to
* navigate the state machine without effecting other threads
* doing the same.
*/
private ExecutionContext () {};
public ExecutionContext (StateMachine stateMachine)
{
__stateMachine = stateMachine;
__state = __stateMachine.getStartState();
__position = 0;
}
/**
* Return current state
*/
public State getState ()
{
return __state;
}
/**
* Get current document position
*/
public int getPosition ()
{
return __position;
}
/**
* Advance to the next state, following failure paths if necessary.
*/
public Set<String> goTo (Character a)
{
// Aho75: algorithm 1
// Follow failure paths until we hit a state that has an edge
// matching the input character
while ( __state.goTo (a) == null ) __state = __state.getFailure ();
__state = __state.goTo (a);
__position++;
return __state.getOutput ();
}
}
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.logging.Logger;
public class State implements Iterable<State>
{
private static Logger log = Logger.getLogger (State.class.getName());
private int __id = 0;
private HashMap<Character, State> __transitions = new HashMap<Character, State> ();
private HashSet<String> __output = new HashSet<String>();
private State __failure;
private State () {}
public State (int id)
{
__id = id;
}
public int getId ()
{
return __id;
}
public HashMap<Character, State> getTransitions ()
{
return __transitions;
}
/**
* Create a new state and add a transition to it from this state
* with the given label. Return a reference to the new state (convenience)
*
* NOTE: all transitions must be added before the State is actually used.
*/
public State addTransition (Character a, State s)
{
__transitions.put (a, s);
return s;
}
public Set<String> getOutput ()
{
return __output;
}
public void addOutput (String output)
{
if (output != null)
{
__output.add (output);
}
else
{
log.warning ("Adding null output string");
}
}
public void mergeOutput (Set<String> output)
{
__output.addAll (output);
}
public void setFailure (State s)
{
__failure = s;
}
public State getFailure ()
{
return __failure;
}
public boolean isStart ()
{
return __id == 0;
}
/**
* Return state that we would transition to on input a. Returns
* null if there is no transition for the input character
* i.e. does not follow failure paths.
*/
public State goTo (Character a)
{
State target = __transitions.get (a);
if (target == null && this.isStart())
{
// Start state loops
return this;
}
else
{
return target;
}
}
/**
* Iterable interface method
*/
public Iterator<State> iterator ()
{
return new StateIterator (this);
}
public String toString ()
{
StringBuffer sb = new StringBuffer ();
sb.append (String.format("[id=%s][failure=%s][transitions[",
__id, __failure == null ? "" : __failure.getId()));
for ( Character c : __transitions.keySet() )
{
sb.append ("[" + c.toString() + " -> " + __transitions.get(c).getId() + "]");
}
sb.append ("]");
return sb.toString ();
}
}
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Stack;
import java.util.logging.Logger;
public class StateIterator implements Iterator<State>
{
private static Logger log = Logger.getLogger (StateIterator.class.getName());
private Stack<State> __stack = new Stack<State>();
private ArrayList<State> __visited = new ArrayList<State>();
private StateIterator () {}
public StateIterator (State state)
{
__stack.push (state);
}
public boolean hasNext ()
{
return (__stack.size() != 0);
}
public State next ()
{
State current = __stack.pop ();
for ( State child : current.getTransitions().values() )
{
// Put state on the stack if we haven't visited already
if ( ! __visited.contains (child) )
{
__stack.push (child);
}
}
return current;
}
public void remove ()
{
throw new UnsupportedOperationException ();
}
}
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.logging.Logger;
public class StateMachine
{
private static Logger log = Logger.getLogger (StateMachine.class.getName());
private static int __stateIdSequence = 0;
private State __startState;
private StateMachine () {}
public StateMachine (List<String> keywords)
{
__startState = new State (nextStateId());
// Construct goto function (Aho75 alogorithm 2)
for (String keyword : keywords )
{
log.finest ("Adding keyword: " + keyword);
enter (keyword);
}
log.finest ("Added " + keywords.size() + " keywords");
// Construct failure function (Aho75 Algorithm 3)
constructFailureFunction ();
// Eliminate failure transitions - Aho75 algorithm 4
}
public State getStartState ()
{
return __startState;
}
private static int nextStateId ()
{
return __stateIdSequence++;
}
/**
* Add pathways for a new keyword
*/
private void enter (String keyword)
{
State s = __startState;
int j = 0;
// Follow existing path as far as possible, don't use goTo()
// because it has pseudo loops on start state
for (j = 0; j < keyword.length(); j++)
{
if (s.getTransitions().get(keyword.charAt(j)) == null) break;
s = s.getTransitions().get(keyword.charAt(j));
}
// Extend path
for (int p = j; p < keyword.length(); p++)
{
State newState = new State (nextStateId());
s = s.addTransition (keyword.charAt(j++), newState);
}
s.addOutput (keyword);
}
/**
* Construct failure function
*/
private void constructFailureFunction ()
{
Queue<State> queue = new LinkedList<State>();
for (Character a : __startState.getTransitions().keySet())
{
State s = __startState.goTo(a);
log.finest ("Constructing failure function for level one state " + s.getId());
if ( s.getId() != 0 )
{
s.setFailure (__startState);
queue.add (s);
}
}
while ( queue.size() != 0 )
{
State r = queue.remove ();
for (Character a : r.getTransitions().keySet())
{
State s = r.getTransitions().get(a);
queue.add (s);
State state = r.getFailure ();
// Special condition for start state because we didn't
// add failure edges for this state
while (state.goTo(a) == null && ! state.isStart())
{
state = state.getFailure();
}
s.setFailure (state.goTo(a));
s.mergeOutput (s.getFailure().getOutput());
}
}
}
public String dot ()
{
StringBuffer sb = new StringBuffer ();
State s;
sb.append ("digraph G {\n");
Iterator<State> iter = __startState.iterator ();
while (iter.hasNext())
{
s = iter.next ();
sb.append (String.format ("\t%s [label=\"%s\", shape=circle];\n",
s.getId(),
s.getId() + " " + s.getOutput()));
// Goto edges
for (Character a : s.getTransitions().keySet())
{
sb.append (String.format ("\t%s -> %s [label=\"%s\"];\n",
s.getId(),
s.getTransitions().get(a).getId(),
a));
}
// Failure function
if ( s.getFailure() != null)
{
sb.append (String.format ("\t%s -> %s [color=\"red\"];\n",
s.getId(),
s.getFailure().getId()));
}
}
sb.append ("}");
return sb.toString ();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment