Skip to content

Instantly share code, notes, and snippets.

@c0rp-aubakirov
Created October 20, 2016 07:14
Show Gist options
  • Save c0rp-aubakirov/bfbdf0c7881e463c24fad186cc2402ef to your computer and use it in GitHub Desktop.
Save c0rp-aubakirov/bfbdf0c7881e463c24fad186cc2402ef to your computer and use it in GitHub Desktop.
package kz.moe.classifier.index;
import kz.moe.parser.model.MessageType;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.*;
/**
 * Builds a TF-IDF weight map from the terms exposed by a Lucene {@link TermsEnum}
 * and wraps it in a {@link Vector}.
 *
 * User: Sanzhar Aubakirov
 * Date: 1/6/16
 */
public class TfIdfExtractor {

    /** Terms whose string length is this value or less are left out of the result. */
    private static final int MAX_IGNORED_TERM_LENGTH = 4;

    /** Supplies the {@code tf} and {@code idf} formulas used for weighting. */
    private static final TFIDFSimilarity SIMILARITY = new DefaultSimilarity();

    /**
     * Walks every remaining term of {@code termsEnum} and computes a weight for it.
     *
     * <p>For each term longer than {@value #MAX_IGNORED_TERM_LENGTH} characters the weight is
     * {@code idf * tf}, where {@code idf} is the similarity's idf of the enum's
     * {@code docFreq()} against {@code numberOfAllDocuments}, and {@code tf} is the sum of the
     * similarity's tf over the frequency of every posting the enum returns for that term.
     *
     * <p>NOTE(review): {@code docId} is only used to label the returned vector; postings are
     * not filtered by it. Presumably the enum is expected to come from the term vector of
     * that single document - confirm against callers. If so, {@code docFreq()} is likely 1
     * for every term, which would make the idf factor constant - TODO confirm.
     *
     * @param docId                identifier stored on the returned vector
     * @param termsEnum            term iterator to consume; it is advanced to exhaustion
     * @param type                 message type stored on the returned vector
     * @param numberOfAllDocuments document count passed to the idf formula
     * @return a new {@link Vector} carrying {@code docId}, the term-to-weight map and {@code type}
     * @throws IOException          if reading terms or postings fails
     * @throws NullPointerException if {@code termsEnum} is {@code null}
     */
    public static Vector extract(int docId, TermsEnum termsEnum, MessageType type, int numberOfAllDocuments) throws IOException {
        Objects.requireNonNull(termsEnum, "termsEnum");

        final Map<String, Double> tfidf = new HashMap<>();
        BytesRef bytesRef;
        while ((bytesRef = termsEnum.next()) != null) {
            final String term = bytesRef.utf8ToString();
            if (term.length() <= MAX_IGNORED_TERM_LENGTH) {
                continue;
            }

            final double idf = SIMILARITY.idf(termsEnum.docFreq(), numberOfAllDocuments);

            // next() has already positioned the enum on this term, so its postings can be
            // read directly; no re-seek is needed.
            double tf = 0;
            final PostingsEnum postings = termsEnum.postings(null);
            while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                tf += SIMILARITY.tf(postings.freq());
            }

            tfidf.put(term, idf * tf);
        }
        return new Vector().docId(docId).tfidf(tfidf).type(type);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment