Skip to content

Instantly share code, notes, and snippets.

Created August 7, 2017 13:57
Show Gist options
  • Save anonymous/d7cd8359d98a2067a3324ac9b06ad7ac to your computer and use it in GitHub Desktop.
Save anonymous/d7cd8359d98a2067a3324ac9b06ad7ac to your computer and use it in GitHub Desktop.
/**
 * Param carrying the LDA estimator used to convert each question into a weighted topic vector.
 */
val lda: Param[LDA] = new Param[LDA](
  this,
  "lda",
  "Convert each question into a weighted topic vector."
)

// When no estimator has been set explicitly, fall back to a freshly constructed LDA.
setDefault(lda -> new LDA())
/**
 * Builds the pipeline stage(s) that run LDA over the question columns.
 *
 * The estimator held by the `lda` param is copied before it is configured, so neither the
 * param's default instance nor an instance supplied by a caller has its optimizer or column
 * names overwritten as a side effect of calling this method.
 *
 * @return a single-element array containing the multi-column LDA stage
 */
private def ldaPipeline(): Array[PipelineStage] = {
  // The "em" optimizer is distributed, supports serialization, but is disk hungry and slow.
  // The "online" runs in the driver, is fast but cannot be serialized.
  // We use the latter, since this model is only used to create a submission and nothing else.
  val optimizer = "online"

  // Configure a private copy: the setters below mutate the instance they are called on, and
  // $(lda) hands back the very object stored in the param.
  val ldaEstimator = $(lda)
    .copy(org.apache.spark.ml.param.ParamMap.empty)
    .setOptimizer(optimizer)
    .setFeaturesCol("tmpinput")
    .setTopicDistributionCol("tmpoutput")

  // "tmpinput"/"tmpoutput" are handed to setStage together with the estimator; presumably
  // MultiColumnPipeline rebinds them to each input/output column pair (confirm against
  // MultiColumnPipeline).
  val mcLda = new MultiColumnPipeline()
    .setInputCols(questions("tfidf"))
    .setOutputCols(questions("lda"))
    .setStage(ldaEstimator, ldaEstimator.getFeaturesCol, ldaEstimator.getTopicDistributionCol)

  Array(mcLda)
}
view raw
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment