Renien/n-gram-scala.scala

## n-gram-scala.scala
object NGram {

  /**
   * Split a document into tokens.
   *
   * Empty tokens (produced by leading or repeated delimiters, or by an empty
   * document) are dropped so that they never end up inside an n-gram.
   *
   * @param data documents
   * @param splitter the delimiting regular expression
   * @return the non-empty strings obtained by splitting `data` around matches
   *         of the given regular expression
   */
  private def split(data: String, splitter: String): Seq[String] =
    data.split(splitter).toSeq.filter(_.nonEmpty)

  /**
   * Compute the n-grams of a document without printing them.
   *
   * @param data documents
   * @param splitter the delimiting regular expression
   * @param nGramCount number of consecutive tokens per gram
   * @return every run of `nGramCount` consecutive tokens, in document order;
   *         empty when `nGramCount` is below 1 or the document has fewer than
   *         `nGramCount` tokens
   */
  def ngrams(data: String, splitter: String = " ", nGramCount: Int = 2): Seq[Seq[String]] = {
    val tokens = split(data, splitter)
    // `sliding` rejects window sizes below 1 and emits a single short window when
    // there are fewer tokens than the window size, so both cases are answered here.
    if (nGramCount < 1 || tokens.length < nGramCount) Vector.empty
    else tokens.sliding(nGramCount).toVector
  }

  /**
   * Process the document to generate the grams and print one gram per line
   * to standard output.
   *
   * Nothing is printed when `nGramCount` is below 1 or when the document
   * contains fewer than `nGramCount` tokens.
   *
   * @param data documents
   * @param splitter the delimiting regular expression
   * @param nGramCount n-gram count
   */
  def process(data: String, splitter: String = " ", nGramCount: Int = 2): Unit =
    ngrams(data, splitter, nGramCount).foreach(gram => println(gram))

  /** Demo entry point: prints the 3-grams of a sample sentence. */
  def main(args: Array[String]): Unit = {
    process("Hello World I Love Scala", splitter = " ", nGramCount = 3)
  }
}
	object NGram {

	  /**
	   * Split a document into tokens.
	   *
	   * Empty tokens (produced by leading or repeated delimiters, or by an empty
	   * document) are dropped so that they never end up inside an n-gram.
	   *
	   * @param data documents
	   * @param splitter the delimiting regular expression
	   * @return the non-empty strings obtained by splitting `data` around matches
	   *         of the given regular expression
	   */
	  private def split(data: String, splitter: String): Seq[String] =
	    data.split(splitter).toSeq.filter(_.nonEmpty)

	  /**
	   * Compute the n-grams of a document without printing them.
	   *
	   * @param data documents
	   * @param splitter the delimiting regular expression
	   * @param nGramCount number of consecutive tokens per gram
	   * @return every run of `nGramCount` consecutive tokens, in document order;
	   *         empty when `nGramCount` is below 1 or the document has fewer than
	   *         `nGramCount` tokens
	   */
	  def ngrams(data: String, splitter: String = " ", nGramCount: Int = 2): Seq[Seq[String]] = {
	    val tokens = split(data, splitter)
	    // `sliding` rejects window sizes below 1 and emits a single short window when
	    // there are fewer tokens than the window size, so both cases are answered here.
	    if (nGramCount < 1 || tokens.length < nGramCount) Vector.empty
	    else tokens.sliding(nGramCount).toVector
	  }

	  /**
	   * Process the document to generate the grams and print one gram per line
	   * to standard output.
	   *
	   * Nothing is printed when `nGramCount` is below 1 or when the document
	   * contains fewer than `nGramCount` tokens.
	   *
	   * @param data documents
	   * @param splitter the delimiting regular expression
	   * @param nGramCount n-gram count
	   */
	  def process(data: String, splitter: String = " ", nGramCount: Int = 2): Unit =
	    ngrams(data, splitter, nGramCount).foreach(gram => println(gram))

	  /** Demo entry point: prints the 3-grams of a sample sentence. */
	  def main(args: Array[String]): Unit = {
	    process("Hello World I Love Scala", splitter = " ", nGramCount = 3)
	  }
	}