Created
October 10, 2012 20:32
-
-
Save s1monw/3868238 to your computer and use it in GitHub Desktop.
readfloor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Enumerates all paths in the automaton that also intersect the FST, | |
* accumulating the FST end node and output for each path. | |
*/ | |
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst, | |
final boolean supportRanges) throws IOException { | |
final List<Path<T>> queue = new ArrayList<Path<T>>(); | |
final List<Path<T>> endNodes = new ArrayList<Path<T>>(); | |
queue.add(new Path<T>(a.getInitialState(), fst | |
.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(), | |
new IntsRef())); | |
final FST.Arc<T> scratchArc = new FST.Arc<T>(); | |
final FST.BytesReader fstReader = fst.getBytesReader(0); | |
// System.out.println("fst/a intersect"); | |
int iter = 0; | |
while (queue.size() != 0) { | |
System.out.println(iter++); | |
final Path<T> path = queue.remove(queue.size() - 1); | |
// System.out.println(" cycle path=" + path); | |
if (path.state.isAccept()) { | |
endNodes.add(path); | |
} | |
IntsRef currentInput = path.input; | |
if (supportRanges) { | |
for (Transition t : path.state.getTransitions()) { | |
if (t.getMin() == t.getMax()) { | |
// System.out.println(" t=" + (char) t.getMin()); | |
final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(), | |
path.fstNode, scratchArc, fstReader); | |
if (nextArc != null) { | |
// System.out.println((char)t.getMin()); | |
// System.out.println("\t num: " + nextArc.numArcs); | |
// System.out.println(" fst matches"); | |
// Path continues: | |
IntsRef newInput = new IntsRef(currentInput.length + 1); | |
newInput.copyInts(currentInput); | |
newInput.ints[currentInput.length] = t.getMin(); | |
newInput.length = currentInput.length + 1; | |
// System.out.println("fixed: " +UnicodeUtil.newString(newInput.ints,newInput.offset, newInput.length)); | |
queue.add(new Path<T>(t.getDest(), new FST.Arc<T>() | |
.copyFrom(nextArc), fst.outputs | |
.add(path.output, nextArc.output), newInput)); | |
} | |
} else { | |
FST.Arc<T> nextArc = readFloor(t.getMin(),fst, | |
path.fstNode, scratchArc, fstReader); | |
do { | |
if (nextArc != null && nextArc.label <= t.getMax()) { | |
System.out.println((char)t.getMin() + " " + (char)t.getMax()); | |
System.out.println(nextArc.label); | |
// System.out.println(" fst matches"); | |
// Path continues: | |
assert nextArc.label <= t.getMax(); | |
assert nextArc.label >= t.getMin(); | |
System.out.println("\t num: " + nextArc.numArcs); | |
IntsRef newInput = new IntsRef(currentInput.length + 1); | |
newInput.copyInts(currentInput); | |
newInput.ints[currentInput.length] = nextArc.label; | |
newInput.length = currentInput.length + 1; | |
// System.out.println(UnicodeUtil.newString(newInput.ints,newInput.offset, newInput.length)); | |
queue.add(new Path<T>(t.getDest(), new FST.Arc<T>() | |
.copyFrom(nextArc), fst.outputs.add(path.output, | |
nextArc.output), newInput)); | |
int label = nextArc.label; | |
nextArc = nextArc.isLast() ? null : fst.readNextRealArc(nextArc, fstReader);; | |
assert nextArc == null || label < nextArc.label : "last: " + label + " next: " + nextArc.label; | |
} else { | |
break; | |
} | |
} while (true); | |
} | |
} | |
} else { | |
for (Transition t : path.state.getTransitions()) { | |
// TODO: we can fix this if necessary: | |
if (t.getMin() != t.getMax()) { | |
throw new IllegalStateException( | |
"can only handle Transitions that match one character"); | |
} | |
// System.out.println(" t=" + (char) t.getMin()); | |
final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(), | |
path.fstNode, scratchArc, fstReader); | |
if (nextArc != null) { | |
// System.out.println(" fst matches"); | |
// Path continues: | |
IntsRef newInput = new IntsRef(currentInput.length + 1); | |
newInput.copyInts(currentInput); | |
newInput.ints[currentInput.length] = t.getMin(); | |
newInput.length = currentInput.length + 1; | |
// System.out.println(UnicodeUtil.newString(newInput.ints,newInput.offset, newInput.length)); | |
queue.add(new Path<T>(t.getDest(), new FST.Arc<T>() | |
.copyFrom(nextArc), fst.outputs | |
.add(path.output, nextArc.output), newInput)); | |
} | |
} | |
} | |
} | |
// for (Path<T> path2 : endNodes) { | |
// System.out.println(UnicodeUtil.newString(path2.input.ints,path2.input.offset, path2.input.length)); | |
// } | |
return endNodes; | |
} | |
public static <T> Arc<T> readFloor(int labelToMatch, FST<T> fst, Arc<T> follow, | |
Arc<T> arc, BytesReader in) throws IOException { | |
if (labelToMatch == FST.END_LABEL) { | |
if (follow.isFinal()) { | |
if (follow.target <= 0) { | |
arc.flags = FST.BIT_LAST_ARC; | |
} else { | |
arc.flags = 0; | |
// NOTE: nextArc is a node (not an address!) in this case: | |
arc.nextArc = follow.target; | |
arc.node = follow.target; | |
} | |
arc.output = follow.nextFinalOutput; | |
arc.label = FST.END_LABEL; | |
return arc; | |
} else { | |
return null; | |
} | |
} | |
if (!FST.targetHasArcs(follow)) { | |
return null; | |
} | |
if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) { | |
// Arcs are fixed array -- use binary search to find | |
// the target. | |
fst.readFirstTargetArc(follow, arc, in); | |
int low = arc.arcIdx; | |
int high = arc.numArcs - 1; | |
int mid = 0; | |
// System.out.println("do arc array low=" + low + " high=" + high + | |
// " targetLabel=" + targetLabel); | |
while (low <= high) { | |
mid = (low + high) >>> 1; | |
in.pos = arc.posArcsStart; | |
in.skip(arc.bytesPerArc * mid + 1); | |
final int midLabel = fst.readLabel(in); | |
final int cmp = midLabel - labelToMatch; | |
// System.out.println(" cycle low=" + low + " high=" + high + " mid=" + | |
// mid + " midLabel=" + midLabel + " cmp=" + cmp); | |
if (cmp < 0) { | |
low = mid + 1; | |
} else if (cmp > 0) { | |
high = mid - 1; | |
} else { | |
arc.arcIdx = mid-1; | |
return fst.readNextRealArc(arc, in); | |
} | |
} | |
if (low == arc.numArcs) { | |
// DEAD END! | |
return null; | |
} | |
arc.arcIdx = (low > high ? high : low)-1; | |
return fst.readNextRealArc(arc, in); | |
} | |
// Linear scan | |
fst.readFirstRealTargetArc(follow.target, arc, in); | |
while (true) { | |
// System.out.println(" non-bs cycle"); | |
// TODO: we should fix this code to not have to create | |
// object for the output of every arc we scan... only | |
// for the matching arc, if found | |
if (arc.label >= labelToMatch) { | |
// System.out.println(" found!"); | |
return arc; | |
} else if (arc.isLast()) { | |
return null; | |
} else { | |
fst.readNextRealArc(arc, in); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment