// Imports needed by this excerpt (MultiColumnPipeline is this project's own class, defined elsewhere).
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.param.Param

val tokenizer: Param[RegexTokenizer] =
  new Param(this, "tokenizer", "Breaks the sentences into individual words.")
// The default tokenizer splits on punctuation characters and spaces.
setDefault(tokenizer, new RegexTokenizer().setPattern("""[\p{Punct} ]"""))

val stopwordsRemover: Param[StopWordsRemover] =
  new Param(this, "stopwords", "Drops stopwords from input text.")
// Calling code needs to provide the list of stopwords; the default is empty.
setDefault(stopwordsRemover, new StopWordsRemover())
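// Callers can supply their own stopword list through the usual Spark ML setter pattern.
// A minimal sketch, assuming this excerpt sits inside a class that mixes in
// org.apache.spark.ml.param.Params (the setter name and the English default list below
// are illustrative, not part of the original excerpt):
def setStopwordsRemover(value: StopWordsRemover): this.type = set(stopwordsRemover, value)
// e.g. setStopwordsRemover(new StopWordsRemover()
//        .setStopWords(StopWordsRemover.loadDefaultStopWords("english")))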
// Convenience helpers for referring to the question columns in our pipeline,
// e.g. questions("tokens") == Array("question1tokens", "question2tokens").
private val questionsCols = Array("question1", "question2")
private def questions(suffix: String) = questionsCols.map(_ + suffix)
private def tokenizePipeline(): Array[PipelineStage] = {
  // Run the tokenizer over both question columns, producing the intermediate token columns...
  val mcTokenizer = new MultiColumnPipeline()
    .setStage($(tokenizer))
    .setInputCols(questions(""))
    .setOutputCols(questions("all_tokens"))
  // ...then remove stopwords from them, producing the final token columns.
  val mcStopwordsRemover = new MultiColumnPipeline()
    .setStage($(stopwordsRemover))
    .setInputCols(mcTokenizer.getOutputCols)
    .setOutputCols(questions("tokens"))
  Array(mcTokenizer, mcStopwordsRemover)
}
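
// A minimal usage sketch (assumptions: this excerpt lives inside a class mixing in
// org.apache.spark.ml.param.Params, MultiColumnPipeline is this project's helper for applying
// one stage to several columns, and `df` has string columns "question1" and "question2";
// the method name below is hypothetical). Chaining both stages in a Pipeline adds the
// "question1tokens" and "question2tokens" columns per the suffixes above.
private def tokenizeQuestions(df: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
  import org.apache.spark.ml.Pipeline
  new Pipeline().setStages(tokenizePipeline()).fit(df).transform(df)
}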