Skip to content

Instantly share code, notes, and snippets.

@Renien
Created May 25, 2016 15:52
Show Gist options
  • Save Renien/04ff4c1a43a18e18e03062b8d1daae9e to your computer and use it in GitHub Desktop.
Save Renien/04ff4c1a43a18e18e03062b8d1daae9e to your computer and use it in GitHub Desktop.
In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sequence of text or speech.
/**
 * Generates word-level n-grams (contiguous runs of n tokens) from a document
 * and prints them to standard output.
 */
object NGram {

  /**
   * Split the document into tokens.
   *
   * @param data     document text
   * @param splitter the delimiting regular expression (passed to `String.split`)
   * @return the tokens obtained by splitting `data` around matches of `splitter`
   */
  private def split(data: String, splitter: String): Seq[String] =
    // Explicit conversion: the implicit Array -> Seq conversion is deprecated in Scala 2.13.
    data.split(splitter).toSeq

  /**
   * Build every full-length window of `nGramCount` consecutive tokens.
   *
   * @param tokens     tokens in document order
   * @param nGramCount window size
   * @return the grams in document order; empty when `nGramCount` is not positive
   *         or when there are fewer than `nGramCount` tokens
   */
  private def grams(tokens: Seq[String], nGramCount: Int): Seq[Seq[String]] =
    // The guard matters twice: `sliding` rejects a non-positive size, and for a
    // collection shorter than the window it would emit one truncated window.
    if (nGramCount <= 0 || tokens.length < nGramCount) Vector.empty
    else tokens.sliding(nGramCount).toVector

  /**
   * Process the document to generate the grams and print each one on its own
   * line (using the gram's `toString`).
   *
   * Nothing is printed when `nGramCount` is not positive or when the document
   * has fewer than `nGramCount` tokens.
   *
   * @param data       document text
   * @param splitter   the delimiting regular expression
   * @param nGramCount number of tokens per gram
   */
  def process(data: String, splitter: String = " ", nGramCount: Int = 2): Unit =
    grams(split(data, splitter), nGramCount).foreach(gram => println(gram))

  /** Demo entry point: prints the 3-grams of a sample sentence. */
  def main(args: Array[String]): Unit =
    process("Hello World I Love Scala", splitter = " ", nGramCount = 3)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment