Skip to content

Instantly share code, notes, and snippets.

@masud-technope
Created July 16, 2018 19:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save masud-technope/910b84bb6374b7e77d5b3283a76446ee to your computer and use it in GitHub Desktop.
Save masud-technope/910b84bb6374b7e77d5b3283a76446ee to your computer and use it in GitHub Desktop.
IDF Calculation with Lucene
/**
 * Computes max-normalized inverse document frequency (IDF) scores from the
 * Lucene index stored under {@code indexFolder}.
 *
 * <p>The method first adds the text of every term of every indexed field to
 * {@code this.keys}. It then looks up, for each entry of {@code this.keys},
 * the document frequency of that term in {@code FIELD_CONTENTS}, converts it
 * with {@code getIDF(numDocs, docFreq)}, and finally divides every score by
 * the largest score seen.
 *
 * <p>This is a best-effort operation: if the index cannot be read, the
 * failure is logged and whatever has been computed so far (possibly an empty
 * map) is returned. The method never throws.
 *
 * @return a map from term text to its IDF divided by the maximum IDF; the
 *         scores are left un-normalized when the maximum IDF is not positive
 */
public HashMap<String, Double> calculateIDFOnly() {
    HashMap<String, Double> inverseDFMap = new HashMap<>();
    // try-with-resources closes the reader first and the directory second,
    // so no file handles leak even when reading fails half-way.
    try (FSDirectory directory = FSDirectory.open(new File(indexFolder).toPath());
            IndexReader reader = DirectoryReader.open(directory)) {

        // Collect the vocabulary of the index.
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) { // null when the index has no postings at all
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue; // field carries no indexed terms
                }
                TermsEnum termsEnum = terms.iterator();
                BytesRef bytesRef;
                // next() already positions the enum on the returned term, so
                // no additional seekExact() round trip is required.
                while ((bytesRef = termsEnum.next()) != null) {
                    this.keys.add(bytesRef.utf8ToString());
                }
            }
        }

        // Compute the raw IDF of each distinct key and track the maximum.
        // NOTE(review): keys are gathered from all fields but docFreq is read
        // from FIELD_CONTENTS only, so terms that occur solely in other fields
        // reach getIDF with docFreq == 0 - confirm getIDF tolerates that.
        int numDocs = reader.numDocs();
        double maxIDF = 0;
        for (String term : this.keys) {
            if (inverseDFMap.containsKey(term)) {
                continue; // the key collection may hold the same term twice
            }
            int docFreq = reader.docFreq(new Term(FIELD_CONTENTS, term));
            double idf = getIDF(numDocs, docFreq);
            inverseDFMap.put(term, idf);
            if (idf > maxIDF) {
                maxIDF = idf;
            }
        }

        // Normalize each map entry exactly once. Iterating the map instead of
        // this.keys keeps a duplicated key from being divided repeatedly, and
        // the guard avoids turning every score into NaN/Infinity when no
        // positive IDF was found.
        if (maxIDF > 0) {
            for (java.util.Map.Entry<String, Double> entry : inverseDFMap.entrySet()) {
                entry.setValue(entry.getValue() / maxIDF);
            }
        }
    } catch (Exception exc) {
        // Keep the best-effort contract (never throw), but do not hide the
        // failure: an unreadable index would otherwise look like an empty one.
        java.util.logging.Logger.getLogger(getClass().getName()).log(
                java.util.logging.Level.WARNING,
                "IDF calculation failed for index folder: " + indexFolder, exc);
    }
    return inverseDFMap;
}
/**
 * Computes max-normalized inverse document frequency (IDF) scores from the
 * Lucene index stored under {@code indexFolder}.
 *
 * <p>The method first adds the text of every term of every indexed field to
 * {@code this.keys}. It then looks up, for each entry of {@code this.keys},
 * the document frequency of that term in {@code FIELD_CONTENTS}, converts it
 * with {@code getIDF(numDocs, docFreq)}, and finally divides every score by
 * the largest score seen.
 *
 * <p>This is a best-effort operation: if the index cannot be read, the
 * failure is logged and whatever has been computed so far (possibly an empty
 * map) is returned. The method never throws.
 *
 * @return a map from term text to its IDF divided by the maximum IDF; the
 *         scores are left un-normalized when the maximum IDF is not positive
 */
public HashMap<String, Double> calculateIDFOnly() {
    HashMap<String, Double> inverseDFMap = new HashMap<>();
    // try-with-resources closes the reader first and the directory second,
    // so no file handles leak even when reading fails half-way.
    try (FSDirectory directory = FSDirectory.open(new File(indexFolder).toPath());
            IndexReader reader = DirectoryReader.open(directory)) {

        // Collect the vocabulary of the index.
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) { // null when the index has no postings at all
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue; // field carries no indexed terms
                }
                TermsEnum termsEnum = terms.iterator();
                BytesRef bytesRef;
                // next() already positions the enum on the returned term, so
                // no additional seekExact() round trip is required.
                while ((bytesRef = termsEnum.next()) != null) {
                    this.keys.add(bytesRef.utf8ToString());
                }
            }
        }

        // Compute the raw IDF of each distinct key and track the maximum.
        // NOTE(review): keys are gathered from all fields but docFreq is read
        // from FIELD_CONTENTS only, so terms that occur solely in other fields
        // reach getIDF with docFreq == 0 - confirm getIDF tolerates that.
        int numDocs = reader.numDocs();
        double maxIDF = 0;
        for (String term : this.keys) {
            if (inverseDFMap.containsKey(term)) {
                continue; // the key collection may hold the same term twice
            }
            int docFreq = reader.docFreq(new Term(FIELD_CONTENTS, term));
            double idf = getIDF(numDocs, docFreq);
            inverseDFMap.put(term, idf);
            if (idf > maxIDF) {
                maxIDF = idf;
            }
        }

        // Normalize each map entry exactly once. Iterating the map instead of
        // this.keys keeps a duplicated key from being divided repeatedly, and
        // the guard avoids turning every score into NaN/Infinity when no
        // positive IDF was found.
        if (maxIDF > 0) {
            for (java.util.Map.Entry<String, Double> entry : inverseDFMap.entrySet()) {
                entry.setValue(entry.getValue() / maxIDF);
            }
        }
    } catch (Exception exc) {
        // Keep the best-effort contract (never throw), but do not hide the
        // failure: an unreadable index would otherwise look like an empty one.
        java.util.logging.Logger.getLogger(getClass().getName()).log(
                java.util.logging.Level.WARNING,
                "IDF calculation failed for index folder: " + indexFolder, exc);
    }
    return inverseDFMap;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment