Skip to content

Instantly share code, notes, and snippets.

@c0rp-aubakirov
Created October 20, 2016 07:15
Show Gist options
  • Save c0rp-aubakirov/5795d76b4c9f4840862f716ca359e807 to your computer and use it in GitHub Desktop.
Save c0rp-aubakirov/5795d76b4c9f4840862f716ca359e807 to your computer and use it in GitHub Desktop.
/**
 * Builds a document-by-term weight matrix for the {@code "body"} field of the index and
 * writes it to {@code /tmp/data.svm}, one line per matched document.
 *
 * <p>Each output line has the form {@code <label>,<v0>\t<v1>\t...\t\n}, where the label is
 * {@code 1} when the stored {@code "type"} field of the document equals
 * {@link MessageType#NOTIFICATION} and {@code 0} otherwise, and the values are the matrix
 * row for that document.
 *
 * <p>Only terms longer than four characters get a matrix column; shorter terms found in a
 * document's term vector are ignored. Documents without a term vector for {@code "body"}
 * keep an all-zero row.
 *
 * <p>NOTE(review): the weight of a term is {@code tf(docFreq) * idf(docFreq, documents.size())},
 * i.e. the "tf" part is derived from the document frequency, so every document containing a
 * term receives the same weight for it. This is kept as-is; confirm whether a per-document
 * term frequency was intended.
 *
 * @throws IllegalStateException if the index exposes no terms for the {@code "body"} field
 * @throws Exception if reading the index or writing the output file fails
 */
public void extractTfidfVector() throws Exception {
    final int minTermLengthExclusive = 4;

    final IndexReader indexReader = indexerMany.readIndex();
    final TFIDFSimilarity tfidfSIM = new DefaultSimilarity();
    final Map<String, Double> termToTFIDF = new HashMap<>();
    final Map<String, Integer> termToNumber = new HashMap<>();

    // Either lookup may return null (no fields at all / no such field); fail with a clear message.
    final Fields fields = MultiFields.getFields(indexReader);
    final Terms bodyTerms = fields == null ? null : fields.terms("body");
    if (bodyTerms == null) {
        throw new IllegalStateException("Index has no terms for field \"body\"");
    }

    // Pass 1: assign a column and a weight to every sufficiently long term of the field.
    final TermsEnum iterator = bodyTerms.iterator();
    BytesRef term;
    while ((term = iterator.next()) != null) {
        final String termString = term.utf8ToString();
        if (termString.length() <= minTermLengthExclusive) {
            continue;
        }
        // docFreq() is the number of postings for the current term, which is what the
        // previous seekExact + postings walk was counting by hand.
        final int docFreq = iterator.docFreq();
        final double tf = tfidfSIM.tf(docFreq);
        final double idf = tfidfSIM.idf(docFreq, documents.size());
        termToTFIDF.put(termString, idf * tf);
        termToNumber.put(termString, termToNumber.size());
    }

    final RealMatrix matrix = new OpenMapRealMatrix(documents.size(), termToTFIDF.size());
    final IndexSearcher searcher = new IndexSearcher(indexReader);
    final Query q = new MatchAllDocsQuery();
    final TopDocs docs = searcher.search(q, documents.size());
    final ScoreDoc[] hits = docs.scoreDocs;

    // Pass 2: fill one matrix row per hit. Rows are indexed by hit position (not by Lucene
    // doc id) so that they always stay inside the matrix and line up with the write loop.
    for (int row = 0; row < hits.length; row++) {
        final Fields termVectors = indexReader.getTermVectors(hits[row].doc);
        if (termVectors == null) {
            continue;
        }
        final Terms mayBeBody = termVectors.terms("body");
        if (mayBeBody == null) {
            continue;
        }
        final List<String> bodyTermList = getTerms(mayBeBody.iterator());
        for (String bodyTerm : bodyTermList) {
            final Integer column = termToNumber.get(bodyTerm);
            if (column == null) {
                // Term was filtered out in pass 1 (too short); it has no column.
                continue;
            }
            matrix.setEntry(row, column, termToTFIDF.get(bodyTerm));
        }
    }

    // Pass 3: write "<label>,<tab-separated row>" per hit; the writer is flushed on close.
    try (final Writer writer = new FileWriterWithEncoding("/tmp/data.svm", Charsets.UTF_8)) {
        for (int row = 0; row < hits.length; row++) {
            final StringBuilder builder = new StringBuilder();
            // Read the label from the same document the row was built from.
            final String type = indexReader.document(hits[row].doc).getField("type").stringValue();
            builder.append(MessageType.valueOf(type).equals(MessageType.NOTIFICATION) ? 1 : 0).append(",");
            for (double v : matrix.getRow(row)) {
                builder.append(v).append("\t");
            }
            builder.append("\n");
            writer.write(builder.toString());
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment