Created
October 20, 2016 07:15
-
-
Save c0rp-aubakirov/5795d76b4c9f4840862f716ca359e807 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public void extractTfidfVector() throws Exception { | |
final IndexReader indexReader = indexerMany.readIndex(); | |
final TFIDFSimilarity tfidfSIM = new DefaultSimilarity(); | |
final Map<String, Double> termToTFIDF = new HashMap<>(); | |
final Map<String, Integer> termToNumber = new HashMap<>(); | |
final Fields fields = MultiFields.getFields(indexReader); | |
final Terms bodyTerms = fields.terms("body"); | |
final TermsEnum iterator = bodyTerms.iterator(); | |
BytesRef term; | |
while ((term = iterator.next()) != null) { | |
final String termString = term.utf8ToString(); | |
final double tf = tfidfSIM.tf(iterator.docFreq()); | |
int numDocs = 0; | |
if (iterator.seekExact(term)) { | |
final PostingsEnum docsEnum = iterator.postings(null); | |
while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { | |
numDocs++; | |
} | |
} | |
final double idf = tfidfSIM.idf(numDocs, documents.size()); | |
final double tfidf = idf * tf; | |
if (termString.length() > 4) { | |
termToTFIDF.put(termString, tfidf); | |
termToNumber.put(termString, termToNumber.size()); | |
} | |
} | |
final RealMatrix matrix = new OpenMapRealMatrix(documents.size(), termToTFIDF.size()); | |
final IndexSearcher searcher = new IndexSearcher(indexReader); | |
final Query q = new MatchAllDocsQuery(); | |
final TopDocs docs = searcher.search(q, documents.size()); | |
final ScoreDoc[] hits = docs.scoreDocs; | |
for (ScoreDoc hit : hits) { | |
final int docId = hit.doc; | |
final Fields termVectors = indexReader.getTermVectors(docId); | |
if (termVectors == null) continue; | |
final Terms mayBeBody = termVectors.terms("body"); | |
if (mayBeBody == null) continue; | |
final TermsEnum body = mayBeBody.iterator(); | |
final List<String> bodyTermList = getTerms(body); | |
for (String bodyTerm : bodyTermList) { | |
matrix.setEntry(docId, termToNumber.get(bodyTerm), termToTFIDF.get(bodyTerm)); | |
} | |
} | |
try (final Writer writer = new FileWriterWithEncoding("/tmp/data.svm", Charsets.UTF_8)) { | |
for (int i = 0; i < hits.length; i++) { | |
final StringBuilder builder = new StringBuilder(); | |
final String type = indexReader.document(i).getField("type").stringValue(); | |
builder.append(MessageType.valueOf(type).equals(MessageType.NOTIFICATION) ? 1 : 0).append(","); | |
final double[] row = matrix.getRow(i); | |
for (double v : row) { | |
builder.append(v).append("\t"); | |
} | |
builder.append("\n"); | |
writer.write(builder.toString()); | |
writer.flush(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment