Created
May 25, 2016 15:52
-
-
Save Renien/04ff4c1a43a18e18e03062b8d1daae9e to your computer and use it in GitHub Desktop.
In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sequence of text or speech.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
  * Word-level n-gram generator.
  *
  * An n-gram is a contiguous run of `n` tokens taken from a tokenised document.
  */
object NGram {

  /**
    * Split the document into tokens.
    *
    * Empty tokens (produced by leading or repeated delimiters, or by an empty
    * document) are dropped so that they never end up inside an n-gram.
    *
    * @param data     document to tokenise
    * @param splitter the delimiting regular expression
    * @return the non-empty strings obtained by splitting `data` around matches
    *         of `splitter`, in their original order
    */
  private def split(data: String, splitter: String): Seq[String] =
    // Explicit `.toSeq`: the implicit Array-to-Seq conversion is deprecated in Scala 2.13.
    data.split(splitter).filter(_.nonEmpty).toSeq

  /**
    * Compute the n-grams of a document.
    *
    * @param data       document to process
    * @param splitter   the delimiting regular expression
    * @param nGramCount number of tokens per gram
    * @return every contiguous window of `nGramCount` tokens, in document order;
    *         empty when `nGramCount` is not positive or the document has fewer
    *         than `nGramCount` tokens
    */
  def ngrams(data: String, splitter: String = " ", nGramCount: Int = 2): Seq[Seq[String]] = {
    val tokens = split(data, splitter)
    // The length guard is required: `sliding` would otherwise emit one short,
    // partial window when there are fewer tokens than the window size, and it
    // throws for a non-positive window size.
    if (nGramCount <= 0 || tokens.length < nGramCount) Seq.empty
    else tokens.sliding(nGramCount).toVector
  }

  /**
    * Process the document to generate the grams and print each one on its own line.
    *
    * Nothing is printed when `nGramCount` is not positive or the document has
    * fewer than `nGramCount` tokens.
    *
    * @param data       document to process
    * @param splitter   the delimiting regular expression
    * @param nGramCount n-gram count (number of tokens per gram)
    */
  def process(data: String, splitter: String = " ", nGramCount: Int = 2): Unit =
    ngrams(data, splitter, nGramCount).foreach(gram => println(gram))

  /** Demo entry point: prints the trigrams of a sample sentence. */
  def main(args: Array[String]): Unit =
    process("Hello World I Love Scala", splitter = " ", nGramCount = 3)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment