Created
August 8, 2017 14:11
-
-
Save anonymous/2fb24721e8dedbd97d76f75c25c9af7c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Param holding the tokenizer stage that breaks sentences into individual words.
  *
  * The default is a [[RegexTokenizer]] whose pattern matches a single punctuation
  * character or a space. Presumably the pattern acts as the delimiter between
  * tokens (RegexTokenizer's default "gaps" mode) — confirm against the Spark
  * version in use.
  */
val tokenizer: Param[RegexTokenizer] =
  new Param(this, "tokenizer", "Breaks the sentences into individual words.")
setDefault(tokenizer, new RegexTokenizer().setPattern("""[\p{Punct} ]"""))
/** Param holding the stage that drops stopwords from the tokenized text.
  *
  * NOTE(review): the param is registered under the name "stopwords", which
  * differs from the member name `stopwordsRemover` — confirm this is intentional.
  */
val stopwordsRemover: Param[StopWordsRemover] =
  new Param(this, "stopwords", "Drops stopwords from input text.")
// Calling code is expected to provide the list of stopwords.
// NOTE(review): a freshly constructed Spark StopWordsRemover appears to use its
// built-in default stop-word list rather than an empty one — confirm before
// relying on "no stopwords removed" as the default behaviour.
setDefault(stopwordsRemover, new StopWordsRemover())
// Base names of the two question columns handled by the pipeline.
private val questionsCols = Array("question1", "question2")

// Convenience helper to refer to columns in our pipeline: returns the question
// column names with `suffix` appended directly (no separator is inserted), e.g.
// questions("tokens") yields Array("question1tokens", "question2tokens").
private def questions(suffix: String): Array[String] = questionsCols.map(_ + suffix)
/** Builds the two stages that turn the raw question columns into token columns.
  *
  * The first stage wraps the configured `tokenizer` and reads the plain question
  * columns, writing to the "all_tokens"-suffixed columns. The second stage wraps
  * the configured `stopwordsRemover`, reads the first stage's output columns and
  * writes to the "tokens"-suffixed columns.
  *
  * NOTE(review): MultiColumnPipeline is defined elsewhere; presumably it applies
  * the wrapped stage to each input/output column pair — confirm.
  *
  * @return the tokenizer stage followed by the stopwords-removal stage
  */
private def tokenizePipeline(): Array[PipelineStage] = {
  // The original was missing the closing parenthesis on this constructor call.
  val mcTokenizer = new MultiColumnPipeline()
    .setStage($(tokenizer))
    .setInputCols(questions(""))
    .setOutputCols(questions("all_tokens"))

  // Chain on the tokenizer's outputs so the two stages stay in sync.
  val mcStopwordsRemover = new MultiColumnPipeline()
    .setStage($(stopwordsRemover))
    .setInputCols(mcTokenizer.getOutputCols)
    .setOutputCols(questions("tokens"))

  Array(mcTokenizer, mcStopwordsRemover)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment