Created
August 3, 2011 14:38
-
-
Save shreyaskarnik/1122788 to your computer and use it in GitHub Desktop.
Infer from Labeled LDA Model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// http://nlp.stanford.edu/software/tmt/0.3/
// Infers per-document topic distributions from a trained Labeled LDA (LLDA)
// model using the Stanford Topic Modeling Toolbox (TMT).
// Usage: <script> modelPath input.tsv output.tsv

// tells Scala where to find the TMT classes
import scalanlp.io._;
import scalanlp.stage._;
import scalanlp.stage.text._;
import scalanlp.text.tokenize._;
import scalanlp.pipes.Pipes.global._;
import edu.stanford.nlp.tmt.stage._;
import edu.stanford.nlp.tmt.model.lda._;
import edu.stanford.nlp.tmt.model.llda._;

// Require exactly three command-line arguments; print usage and exit otherwise.
if (args.length != 3) {
  System.err.println("Arguments: modelPath input.tsv output.tsv");
  System.err.println(" modelPath: trained LLDA model");
  System.err.println(" input.tsv: path to input file with two tab separated columns: id, words");
  System.err.println(" output.tsv: id followed by (word (label:prob)*)* for each word in each doc");
  System.exit(-1);
}

val modelPath = file(args(0));  // directory containing the trained LLDA model
val inputPath = file(args(1));  // TSV input: column 1 = id, column 2 = words
val outputPath = file(args(2)); // TSV output destination

System.err.println("Loading model ...");
// Load the CVB0-trained Labeled LDA model, then view it as a plain CVB0 LDA
// model so the standard LDA inference stages can be applied to it.
val lldaModel = LoadCVB0LabeledLDA(modelPath);
val model = lldaModel.asCVB0LDA;

// NOTE(review): TMT column selectors are 1-indexed. An
// IndexOutOfBoundsException raised from Column(...) usually means a row in
// the input TSV has fewer columns than expected — verify the input format.
val source = TSVFile(inputPath) ~> IDColumn(1);
// Tokenize column 2 with the same tokenizer the model was trained with.
val text = source ~> Column(2) ~> TokenizeWith(model.tokenizer.get);

// Build the dataset against the model's existing term index so unseen terms
// are handled consistently with training rather than re-indexed.
val dataset = LDADataset(text, model.termIndex);

System.err.println("Generating output ...");
val perDocTopicDistributions =
  //InferCVB0DocumentTopicAssignments(model,dataset);
  InferCVB0DocumentTopicDistributions(model, dataset);

// Write one row per document: id followed by its topic distribution.
TSVFile(outputPath).write(perDocTopicDistributions);
//val topTerms = QueryTopTopics(model, dataset, perDocWordTopicDistributions);
@muggle98 I had the same error, which 'magically' fixed itself; I tried to reproduce it but could not. I suspect it is fixed by erasing all cache files created by the TMT, closing the program, and running it again. Another precaution I took was indexing the columns by column id starting from 1, not from 0, and also dropping the first row if your columns have header titles.
I also erased the invalid .csv or .tsv outputs created by the inference when it failed.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I tried this code but I get the following error message — do you have any idea what causes it? Thanks!
Loading model ...
Generating output ...
[Concurrent] 32 permits
[Concurrent] 32 permits
Exception in thread "Thread-3" java.lang.IndexOutOfBoundsException: 1
at scala.collection.LinearSeqOptimized$class.apply(LinearSeqOptimized.scala:51)
at scala.collection.immutable.List.apply(List.scala:45)
at scalanlp.stage.Column.map(ColumnSelectors.scala:51)
at scalanlp.stage.Column.map(ColumnSelectors.scala:46)
at scalanlp.stage.generic.Mapper$$anonfun$apply$1$$anonfun$apply$2.apply(Mapper.scala:36)
at scalanlp.stage.Item.map(Item.scala:32)
at scalanlp.stage.generic.Mapper$$anonfun$apply$1.apply(Mapper.scala:36)
at scalanlp.stage.generic.Mapper$$anonfun$apply$1.apply(Mapper.scala:36)
at scala.collection.Iterator$$anon$19.next(Iterator.scala:335)
at scala.collection.Iterator$$anon$19.next(Iterator.scala:335)
at edu.stanford.nlp.tmt.data.concurrent.Concurrent$$anonfun$map$2.apply(Concurrent.scala:96)
at edu.stanford.nlp.tmt.data.concurrent.Concurrent$$anonfun$map$2.apply(Concurrent.scala:88)
at edu.stanford.nlp.tmt.data.concurrent.Concurrent$$anon$4.run(Concurrent.scala:45)