Skip to content

Instantly share code, notes, and snippets.

@retnuh
Last active July 5, 2016 15:24
Show Gist options
  • Save retnuh/a58b1d03e5ebd927861a5c2e64dfce3c to your computer and use it in GitHub Desktop.
TokenFSM - a simple token-based FSM for efficient token matching from a lexicon, etc.
package tokenfsm
import scala.annotation.tailrec
import scala.collection.immutable.SortedSet
/**
 * Companion/factory for [[TokenFSM]].
 *
 * Created by retnuh on 04/07/2016.
 */
object TokenFSM {

  /**
   * Orders token sequences first by length, then element-wise using the
   * element ordering. Longer phrases therefore sort after shorter ones inside
   * each per-token [[SortedSet]], which is what the FSM's dominance logic
   * relies on.
   */
  private implicit def lengthOrdering[T](implicit ordering: Ordering[T]): Ordering[Seq[T]] =
    new Ordering[Seq[T]] {

      // Element-wise comparison of two equal-length sequences.
      // Guarding on isEmpty fixes the original's crash (x.head on an empty
      // Seq) when two empty sequences were compared, and lets the recursion
      // be verified with @tailrec instead of using early `return`s.
      @tailrec
      private def compareElements(x: Seq[T], y: Seq[T]): Int =
        if (x.isEmpty) 0 // both exhausted: sequences are equal
        else {
          val fc = ordering.compare(x.head, y.head)
          if (fc == 0) compareElements(x.tail, y.tail) else fc
        }

      override def compare(x: Seq[T], y: Seq[T]): Int = {
        val lc = x.lengthCompare(y.length)
        if (lc == 0) compareElements(x, y) else lc
      }
    }

  /**
   * Builds a [[TokenFSM]] from the given phrases, grouping them by their first
   * token so matching can start lazily on the first token of each phrase.
   *
   * @param states the phrases to match; each must be a non-empty token sequence
   *               (an empty phrase would fail on `ts.head`)
   */
  def apply[T](states: Seq[Seq[T]])(implicit ordering: Ordering[T]): TokenFSM[T] = {
    val initialStates = states.foldLeft(Map.empty[T, SortedSet[Seq[T]]]) { (m, ts) =>
      m + (ts.head -> (m.getOrElse(ts.head, SortedSet.empty[Seq[T]](lengthOrdering)) + ts))
    }
    new TokenFSM(initialStates)
  }
}
/**
 * A simple token-based finite state machine for efficiently matching phrases
 * from a lexicon against a sequence of tokens.
 *
 * @param initialStates candidate phrases keyed by their first token, ordered by
 *                      length so that longer phrases dominate shorter ones
 */
class TokenFSM[T](initialStates: Map[T, SortedSet[Seq[T]]])(implicit ordering: Ordering[T]) {

  import TokenFSM._

  /**
   * Consumes `tokens` one at a time: every token that begins a known phrase
   * spawns a new [[Matching]] candidate that dominates the states already in
   * progress, then all candidates are advanced by the current token.
   *
   * Debug `println` calls from the original have been removed; the machine
   * should be silent in normal operation.
   *
   * @return completed matches as (phrase, (startOffset, endOffset)) pairs
   */
  @tailrec
  private def matchEach(tokens: Seq[T], pos: Int, inProgress: MachineState[T]): Seq[(Seq[T], (Int, Int))] = {
    if (tokens.isEmpty) {
      // End of input: harvest every completed match from the state tree.
      inProgress.complete.collect { case Matched(t, s, e) => (t, (s, e)) }
    } else {
      val head = tokens.head
      // Phrases starting with `head` become new candidates; foldRight keeps
      // the longest (SortedSet-last) phrase as the most dominant state.
      val dominant: MachineState[T] = initialStates
        .getOrElse(head, SortedSet.empty[Seq[T]](lengthOrdering))
        .foldRight(inProgress)((ts: Seq[T], acc: MachineState[T]) => acc.dominate(Matching(ts, 0, pos)))
      matchEach(tokens.tail, pos + 1, dominant.nextToken(head, pos))
    }
  }

  /**
   * Runs the FSM over the whole token sequence.
   *
   * @return matched phrases with their inclusive (start, end) token offsets
   */
  def matches(tokens: Seq[T]): Seq[(Seq[T], (Int, Int))] =
    matchEach(tokens, 0, NonMatch[T]())
}
/**
 * A state of the token-matching machine. States are fed tokens one at a time
 * via [[nextToken]] and report their completed matches via [[complete]] once
 * the input is exhausted.
 */
sealed trait MachineState[T] {

  /** Advances this state with the next input token seen at `offset`. */
  def nextToken(token: T, offset: Int): MachineState[T]

  /** The completed ([[Matched]]) states this state has produced, if any. */
  def complete: Seq[MachineState[T]]

  /**
   * Combines this state with `subject`, giving this state precedence:
   * a [[NonMatch]] on either side yields the other state unchanged; an
   * existing [[DominatingMatching]] pushes `subject` down its chain;
   * otherwise the pair is wrapped with this state as the dominant one.
   *
   * Debug `println` calls from the original have been removed.
   */
  def dominate(subject: MachineState[T]): MachineState[T] = (this, subject) match {
    case (NonMatch(), s) => s
    case (t, NonMatch()) => t
    case (DominatingMatching(d, os), ns) => DominatingMatching(d, os.dominate(ns))
    case (t, s) => DominatingMatching(t, s)
  }
}
/** Marker for machine states that are still consuming tokens (not yet terminal). */
trait InProgress[T] extends MachineState[T]
/**
 * Pairs a dominant state with a subjugated one so that, on completion, the
 * dominant match suppresses subjugated matches it covers. Precedence logic is
 * order-sensitive; code is left untouched and only annotated here.
 */
case class DominatingMatching[T](dominant: MachineState[T], subjugated: MachineState[T]) extends InProgress[T] {

  // Advance both sides with the incoming token.
  override def nextToken(token: T, offset: Int): MachineState[T] = {
    val d = dominant.nextToken(token, offset)
    val subj = subjugated.nextToken(token, offset)
    d match {
      // If we don't match, just fallback to our subjugated match
      case NonMatch() => subj
      // If the dominant has matched, it overrides any subjects that have matched and have <= end.
      // We have to keep going, though, in cases where a subjugated match fails at a later point in time and returns
      // a match that we dominate. So we just keep going until complete.
      case _ => DominatingMatching(d, subj)
    }
  }

  // Collect completed matches, letting the dominant side suppress subjugated
  // matches that end at or before its own end offset.
  override def complete: Seq[MachineState[T]] = {
    val d = dominant.complete
    val subjs = subjugated.complete
    d match {
      // If the dominant has matched, it overrides any subjects that have matched and have <= end.
      // We have to keep going, though, in cases where a subjugated match fails at a later point in time and returns
      // a match that we dominate. So we just keep going until complete.
      // NOTE(review): this pattern fires only when the dominant completes with
      // EXACTLY one match; a dominant that is itself a DominatingMatching with
      // several completed matches falls through to `case _` and those matches
      // are discarded — confirm this is intended. Also `Matched[T]` is erased
      // at runtime, so the type argument is unchecked (compiler warning).
      case Seq(x: Matched[T]) => x +: subjs.collect({ case s: Matched[T] if s.end > x.end => s })
      // If we haven't matched, just fallback to our subjugated match
      case _ => subjs
    }
  }
}
/**
 * An in-progress match of `tokens`, currently expecting the element at index
 * `pos`; `start` is the input offset where this candidate began.
 */
case class Matching[T](tokens: Seq[T], pos: Int, start: Int) extends InProgress[T] {

  /**
   * Advances the candidate: on the expected token it either completes (when
   * this was the final element) or moves to the next position; any other token
   * abandons the candidate.
   *
   * Fix: the advance branch now reuses `nextPos` instead of recomputing
   * `pos + 1` (behavior unchanged, removes the inconsistency).
   */
  override def nextToken(token: T, offset: Int): MachineState[T] = {
    val nextPos = pos + 1
    if (tokens(pos) == token) {
      if (nextPos == tokens.length) Matched(tokens, start, offset)
      else Matching(tokens, nextPos, start)
    } else {
      NonMatch[T]()
    }
  }

  /** An unfinished candidate contributes no completed matches. */
  override def complete: Seq[MachineState[T]] = Seq.empty
}
/**
 * A completed match of `tokens` spanning the inclusive input offsets
 * [`start`, `end`]. This state is terminal.
 */
case class Matched[T](tokens: Seq[T], start: Int, end: Int) extends MachineState[T] {

  /** A finished match reports itself as its sole completed result. */
  override def complete: Seq[MachineState[T]] = Seq(this)

  /** Already complete — further input is ignored. */
  override def nextToken(token: T, offset: Int): MachineState[T] = this
}
/**
 * The absorbing failure state: it never matches anything and produces no
 * completed results. Also used as the initial state for a run.
 */
case class NonMatch[T]() extends MachineState[T] {

  /** Failure is sticky — any further token leaves us in NonMatch. */
  override def nextToken(token: T, offset: Int): MachineState[T] = this

  /** Nothing was matched, so nothing is reported. */
  override def complete: Seq[MachineState[T]] = Seq.empty
}
package tokenfsm
import org.scalatest.Matchers
/**
 * Specs for [[TokenFSM]]: matching, overlap, dominance of longer phrases,
 * fallback when a long phrase fails, and de-duplication.
 *
 * Created by retnuh on 04/07/2016.
 */
class TokenFSMSpecs extends org.scalatest.FlatSpec with Matchers {

  val ws = """\s+""".r

  // Whitespace-tokenizes a sentence into an immutable Seq of tokens.
  def split(s: String) = ws.pattern.split(s).to[Seq]

  val sentence = split("the quick brown fox jumped over the lazy dog")

  // Debug println calls removed throughout; assertions are unchanged.

  "A TokenFSM" should "match the tokens and offsets from a seq of tokens" in {
    val fsm = TokenFSM(Seq(Seq("quick", "brown"), Seq("lazy", "dog"), Seq("the", "quick", "fail")))
    val matches = fsm.matches(sentence)
    matches.length shouldBe 2
    matches(0) shouldBe(Seq("quick", "brown"), (1, 2))
    matches(1) shouldBe(Seq("lazy", "dog"), (7, 8))
  }

  it should "allow overlapping matches" in {
    val fsm = TokenFSM(Seq(split("fluffy pink"), split("pink bunny"), split("pink")))
    val matches = fsm.matches(split("the fluffy pink bunny is awesome"))
    matches.length shouldBe 2
    matches(0) shouldBe(split("fluffy pink"), (1, 2))
    matches(1) shouldBe(split("pink bunny"), (2, 3))
  }

  it should "not match smaller token phrases that are contained in larger matches" in {
    val fsm = TokenFSM(Seq(split("gold"), split("white gold"), split("yellow")))
    val matches = fsm.matches(split("yellow and white gold"))
    matches.length shouldBe 2
    matches(0) shouldBe(split("yellow"), (0, 0))
    matches(1) shouldBe(split("white gold"), (2, 3))
  }

  it should "have smaller phrases match if a larger phrase does not complete by the end of the sentence" in {
    val fsm = TokenFSM(Seq(split("a"), split("b c"), split("b c d"), split("a b c d e f")))
    val matches = fsm.matches(split("a b c d e"))
    matches.length shouldBe 2
    matches(0) shouldBe(split("a"), (0, 0))
    matches(1) shouldBe(split("b c d"), (1, 3))
  }

  it should "remove duplicate matches" in {
    val fsm = TokenFSM(Seq(split("a"), split("a b x"), split("a b y"), split("a b z")))
    val matches = fsm.matches(split("a b c"))
    matches.length shouldBe 1
    matches(0) shouldBe(split("a"), (0, 0))
  }

  it should "not include smaller matches even when one dominant match fails but other matches" in {
    val text = "yellow rose and white gold bracelet with diamonds Cartier, price upon request. Spike cuff, Giles & Brother by Philip Crangi, $88."
    val fsm = TokenFSM(Seq(split("yellow"), split("rose"), split("white gold"), split("white pearls"), split("gold puree"),
      split("gold"), split("white"), split("diamonds")))
    val matches = fsm.matches(split(text))
    matches shouldNot contain((Seq("white"), (3, 3)))
    matches shouldNot contain((Seq("gold"), (4, 4)))
    matches should contain((split("white gold"), (3, 4)))
    matches.length shouldBe 4
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment