Created
July 13, 2018 20:18
-
-
Save masud-technope/a87484a1689b603a820b91def75882b2 to your computer and use it in GitHub Desktop.
Lucene TF (term frequency) calculation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static final String FIELD_CONTENTS = "contents"; | |
public HashMap<String, Long> calculateTF() { | |
HashMap<String, Long> termFreqMap = new HashMap<>(); | |
try { | |
IndexReader reader = DirectoryReader.open(FSDirectory | |
.open(new File(indexFolder).toPath())); | |
// String targetTerm = "breakpoint"; | |
Fields fields = MultiFields.getFields(reader); | |
for (String field : fields) { | |
Terms terms = fields.terms(field); | |
TermsEnum termsEnum = terms.iterator(); | |
BytesRef bytesRef; | |
while ((bytesRef = termsEnum.next()) != null) { | |
if (termsEnum.seekExact(bytesRef)) { | |
String term = bytesRef.utf8ToString(); | |
this.keys.add(term); | |
} | |
} | |
} | |
for (String term : this.keys) { | |
Term t = new Term(FIELD_CONTENTS, term); | |
// calculating the TF | |
long totalTermFreq = reader.totalTermFreq(t); | |
if (!termFreqMap.containsKey(term)) { | |
termFreqMap.put(term, totalTermFreq); | |
totalTermFreqCorpus += totalTermFreq; | |
} | |
} | |
} catch (Exception exc) { | |
// handle the exception | |
} | |
return termFreqMap; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment