Skip to content

Instantly share code, notes, and snippets.

@retnuh
Last active July 5, 2016 15:24
Show Gist options
  • Save retnuh/a58b1d03e5ebd927861a5c2e64dfce3c to your computer and use it in GitHub Desktop.
TokenFSM - a simple token-based FSM for efficient token matching from a lexicon, etc.
package tokenfsm
import scala.annotation.tailrec
import scala.collection.immutable.SortedSet
/**
 * Companion/factory for [[TokenFSM]].
 *
 * Created by retnuh on 04/07/2016.
 */
object TokenFSM {

  /**
   * Orders token sequences first by length, then element-wise using the
   * element ordering. Longer phrases therefore sort after shorter ones inside
   * each per-token [[SortedSet]], which is what the FSM's dominance logic
   * relies on.
   */
  private implicit def lengthOrdering[T](implicit ordering: Ordering[T]): Ordering[Seq[T]] =
    new Ordering[Seq[T]] {

      // Element-wise comparison of two equal-length sequences.
      // Guarding on isEmpty fixes the original's crash (x.head on an empty
      // Seq) when two empty sequences were compared, and lets the recursion
      // be verified with @tailrec instead of using early `return`s.
      @tailrec
      private def compareElements(x: Seq[T], y: Seq[T]): Int =
        if (x.isEmpty) 0 // both exhausted: sequences are equal
        else {
          val fc = ordering.compare(x.head, y.head)
          if (fc == 0) compareElements(x.tail, y.tail) else fc
        }

      override def compare(x: Seq[T], y: Seq[T]): Int = {
        val lc = x.lengthCompare(y.length)
        if (lc == 0) compareElements(x, y) else lc
      }
    }

  /**
   * Builds a [[TokenFSM]] from the given phrases, grouping them by their first
   * token so matching can start lazily on the first token of each phrase.
   *
   * @param states the phrases to match; each must be a non-empty token sequence
   *               (an empty phrase would fail on `ts.head`)
   */
  def apply[T](states: Seq[Seq[T]])(implicit ordering: Ordering[T]): TokenFSM[T] = {
    val initialStates = states.foldLeft(Map.empty[T, SortedSet[Seq[T]]]) { (m, ts) =>
      m + (ts.head -> (m.getOrElse(ts.head, SortedSet.empty[Seq[T]](lengthOrdering)) + ts))
    }
    new TokenFSM(initialStates)
  }
}
/**
 * A simple token-based finite state machine for efficiently matching phrases
 * from a lexicon against a sequence of tokens.
 *
 * @param initialStates candidate phrases keyed by their first token, ordered by
 *                      length so that longer phrases dominate shorter ones
 */
class TokenFSM[T](initialStates: Map[T, SortedSet[Seq[T]]])(implicit ordering: Ordering[T]) {

  import TokenFSM._

  /**
   * Consumes `tokens` one at a time: every token that begins a known phrase
   * spawns a new [[Matching]] candidate that dominates the states already in
   * progress, then all candidates are advanced by the current token.
   *
   * Debug `println` calls from the original have been removed; the machine
   * should be silent in normal operation.
   *
   * @return completed matches as (phrase, (startOffset, endOffset)) pairs
   */
  @tailrec
  private def matchEach(tokens: Seq[T], pos: Int, inProgress: MachineState[T]): Seq[(Seq[T], (Int, Int))] = {
    if (tokens.isEmpty) {
      // End of input: harvest every completed match from the state tree.
      inProgress.complete.collect { case Matched(t, s, e) => (t, (s, e)) }
    } else {
      val head = tokens.head
      // Phrases starting with `head` become new candidates; foldRight keeps
      // the longest (SortedSet-last) phrase as the most dominant state.
      val dominant: MachineState[T] = initialStates
        .getOrElse(head, SortedSet.empty[Seq[T]](lengthOrdering))
        .foldRight(inProgress)((ts: Seq[T], acc: MachineState[T]) => acc.dominate(Matching(ts, 0, pos)))
      matchEach(tokens.tail, pos + 1, dominant.nextToken(head, pos))
    }
  }

  /**
   * Runs the FSM over the whole token sequence.
   *
   * @return matched phrases with their inclusive (start, end) token offsets
   */
  def matches(tokens: Seq[T]): Seq[(Seq[T], (Int, Int))] =
    matchEach(tokens, 0, NonMatch[T]())
}
/**
 * A state of the token-matching machine. States are fed tokens one at a time
 * via [[nextToken]] and report their completed matches via [[complete]] once
 * the input is exhausted.
 */
sealed trait MachineState[T] {

  /** Advances this state with the next input token seen at `offset`. */
  def nextToken(token: T, offset: Int): MachineState[T]

  /** The completed ([[Matched]]) states this state has produced, if any. */
  def complete: Seq[MachineState[T]]

  /**
   * Combines this state with `subject`, giving this state precedence:
   * a [[NonMatch]] on either side yields the other state unchanged; an
   * existing [[DominatingMatching]] pushes `subject` down its chain;
   * otherwise the pair is wrapped with this state as the dominant one.
   *
   * Debug `println` calls from the original have been removed.
   */
  def dominate(subject: MachineState[T]): MachineState[T] = (this, subject) match {
    case (NonMatch(), s) => s
    case (t, NonMatch()) => t
    case (DominatingMatching(d, os), ns) => DominatingMatching(d, os.dominate(ns))
    case (t, s) => DominatingMatching(t, s)
  }
}
/** Marker for machine states that are still consuming tokens (not yet terminal). */
trait InProgress[T] extends MachineState[T]
/**
 * Pairs a dominant state with a subjugated one so that, on completion, the
 * dominant match suppresses subjugated matches it covers. Precedence logic is
 * order-sensitive; code is left untouched and only annotated here.
 */
case class DominatingMatching[T](dominant: MachineState[T], subjugated: MachineState[T]) extends InProgress[T] {

  // Advance both sides with the incoming token.
  override def nextToken(token: T, offset: Int): MachineState[T] = {
    val d = dominant.nextToken(token, offset)
    val subj = subjugated.nextToken(token, offset)
    d match {
      // If we don't match, just fallback to our subjugated match
      case NonMatch() => subj
      // If the dominant has matched, it overrides any subjects that have matched and have <= end.
      // We have to keep going, though, in cases where a subjugated match fails at a later point in time and returns
      // a match that we dominate. So we just keep going until complete.
      case _ => DominatingMatching(d, subj)
    }
  }

  // Collect completed matches, letting the dominant side suppress subjugated
  // matches that end at or before its own end offset.
  override def complete: Seq[MachineState[T]] = {
    val d = dominant.complete
    val subjs = subjugated.complete
    d match {
      // If the dominant has matched, it overrides any subjects that have matched and have <= end.
      // We have to keep going, though, in cases where a subjugated match fails at a later point in time and returns
      // a match that we dominate. So we just keep going until complete.
      // NOTE(review): this pattern fires only when the dominant completes with
      // EXACTLY one match; a dominant that is itself a DominatingMatching with
      // several completed matches falls through to `case _` and those matches
      // are discarded — confirm this is intended. Also `Matched[T]` is erased
      // at runtime, so the type argument is unchecked (compiler warning).
      case Seq(x: Matched[T]) => x +: subjs.collect({ case s: Matched[T] if s.end > x.end => s })
      // If we haven't matched, just fallback to our subjugated match
      case _ => subjs
    }
  }
}
/**
 * An in-progress match of `tokens`, currently expecting the element at index
 * `pos`; `start` is the input offset where this candidate began.
 */
case class Matching[T](tokens: Seq[T], pos: Int, start: Int) extends InProgress[T] {

  /**
   * Advances the candidate: on the expected token it either completes (when
   * this was the final element) or moves to the next position; any other token
   * abandons the candidate.
   *
   * Fix: the advance branch now reuses `nextPos` instead of recomputing
   * `pos + 1` (behavior unchanged, removes the inconsistency).
   */
  override def nextToken(token: T, offset: Int): MachineState[T] = {
    val nextPos = pos + 1
    if (tokens(pos) == token) {
      if (nextPos == tokens.length) Matched(tokens, start, offset)
      else Matching(tokens, nextPos, start)
    } else {
      NonMatch[T]()
    }
  }

  /** An unfinished candidate contributes no completed matches. */
  override def complete: Seq[MachineState[T]] = Seq.empty
}
/**
 * A completed match of `tokens` spanning the inclusive input offsets
 * [`start`, `end`]. This state is terminal.
 */
case class Matched[T](tokens: Seq[T], start: Int, end: Int) extends MachineState[T] {

  /** A finished match reports itself as its sole completed result. */
  override def complete: Seq[MachineState[T]] = Seq(this)

  /** Already complete — further input is ignored. */
  override def nextToken(token: T, offset: Int): MachineState[T] = this
}
/**
 * The absorbing failure state: it never matches anything and produces no
 * completed results. Also used as the initial state for a run.
 */
case class NonMatch[T]() extends MachineState[T] {

  /** Failure is sticky — any further token leaves us in NonMatch. */
  override def nextToken(token: T, offset: Int): MachineState[T] = this

  /** Nothing was matched, so nothing is reported. */
  override def complete: Seq[MachineState[T]] = Seq.empty
}
package tokenfsm
import org.scalatest.Matchers
/**
 * Specs for [[TokenFSM]]: matching, overlap, dominance of longer phrases,
 * fallback when a long phrase fails, and de-duplication.
 *
 * Created by retnuh on 04/07/2016.
 */
class TokenFSMSpecs extends org.scalatest.FlatSpec with Matchers {

  val ws = """\s+""".r

  // Whitespace-tokenizes a sentence into an immutable Seq of tokens.
  def split(s: String) = ws.pattern.split(s).to[Seq]

  val sentence = split("the quick brown fox jumped over the lazy dog")

  // Debug println calls removed throughout; assertions are unchanged.

  "A TokenFSM" should "match the tokens and offsets from a seq of tokens" in {
    val fsm = TokenFSM(Seq(Seq("quick", "brown"), Seq("lazy", "dog"), Seq("the", "quick", "fail")))
    val matches = fsm.matches(sentence)
    matches.length shouldBe 2
    matches(0) shouldBe(Seq("quick", "brown"), (1, 2))
    matches(1) shouldBe(Seq("lazy", "dog"), (7, 8))
  }

  it should "allow overlapping matches" in {
    val fsm = TokenFSM(Seq(split("fluffy pink"), split("pink bunny"), split("pink")))
    val matches = fsm.matches(split("the fluffy pink bunny is awesome"))
    matches.length shouldBe 2
    matches(0) shouldBe(split("fluffy pink"), (1, 2))
    matches(1) shouldBe(split("pink bunny"), (2, 3))
  }

  it should "not match smaller token phrases that are contained in larger matches" in {
    val fsm = TokenFSM(Seq(split("gold"), split("white gold"), split("yellow")))
    val matches = fsm.matches(split("yellow and white gold"))
    matches.length shouldBe 2
    matches(0) shouldBe(split("yellow"), (0, 0))
    matches(1) shouldBe(split("white gold"), (2, 3))
  }

  it should "have smaller phrases match if a larger phrase does not complete by the end of the sentence" in {
    val fsm = TokenFSM(Seq(split("a"), split("b c"), split("b c d"), split("a b c d e f")))
    val matches = fsm.matches(split("a b c d e"))
    matches.length shouldBe 2
    matches(0) shouldBe(split("a"), (0, 0))
    matches(1) shouldBe(split("b c d"), (1, 3))
  }

  it should "remove duplicate matches" in {
    val fsm = TokenFSM(Seq(split("a"), split("a b x"), split("a b y"), split("a b z")))
    val matches = fsm.matches(split("a b c"))
    matches.length shouldBe 1
    matches(0) shouldBe(split("a"), (0, 0))
  }

  it should "not include smaller matches even when one dominant match fails but other matches" in {
    val text = "yellow rose and white gold bracelet with diamonds Cartier, price upon request. Spike cuff, Giles & Brother by Philip Crangi, $88."
    val fsm = TokenFSM(Seq(split("yellow"), split("rose"), split("white gold"), split("white pearls"), split("gold puree"),
      split("gold"), split("white"), split("diamonds")))
    val matches = fsm.matches(split(text))
    matches shouldNot contain((Seq("white"), (3, 3)))
    matches shouldNot contain((Seq("gold"), (4, 4)))
    matches should contain((split("white gold"), (3, 4)))
    matches.length shouldBe 4
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment