Last active
December 17, 2015 17:19
-
-
Save rainkinz/5645139 to your computer and use it in GitHub Desktop.
Probably a really bad example of counting terms and enumerating their positions in documents.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.document.TextField; | |
import org.apache.lucene.index.*; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.apache.lucene.util.BytesRef; | |
import org.apache.lucene.util.Version; | |
import java.io.IOException; | |
import java.util.Random; | |
public class CountingTerms { | |
private static final Version VERSION = Version.LUCENE_43; | |
private static final String[] terms = "hi am mary and i have a problem with lucene".split(" "); | |
private final Directory indexDir = new RAMDirectory(); | |
private String randomTerms() { | |
Random rand = new Random(); | |
StringBuilder sb = new StringBuilder(); | |
int numTerms = rand.nextInt(terms.length); | |
for (int i = 0; i < numTerms; i++) { | |
sb.append(terms[rand.nextInt(terms.length)]).append(" "); | |
} | |
return sb.toString(); | |
} | |
private void addDocs(IndexWriter writer) throws IOException { | |
for (int i = 0; i < 10; i++) { | |
Document doc = new Document(); | |
String randomStr = randomTerms(); | |
puts("Adding random str: " + randomStr); | |
IndexableField field = new TextField("text", randomStr, Field.Store.YES); | |
doc.add(field); | |
writer.addDocument(doc); | |
} | |
} | |
private void countTerms() throws IOException { | |
DirectoryReader indexReader = DirectoryReader.open(indexDir); | |
AtomicReader reader = indexReader.leaves().get(0).reader(); | |
Fields fields = reader.fields(); | |
Terms terms = fields.terms("text"); | |
TermsEnum termsEnum = terms.iterator(null); | |
BytesRef term; | |
while ((term = termsEnum.next()) != null) { | |
puts("---------------------------------------------------"); | |
puts("Term '" + term.utf8ToString() + "' appears " + termsEnum.totalTermFreq() + " in the index"); | |
DocsAndPositionsEnum docPosEnum = termsEnum.docsAndPositions(reader.getLiveDocs(), | |
null, | |
DocsAndPositionsEnum.FLAG_OFFSETS); | |
int docid; | |
while ((docid = docPosEnum.nextDoc()) != DocsAndPositionsEnum.NO_MORE_DOCS) { | |
int freq = docPosEnum.freq(); | |
int[] positions = new int[freq]; | |
for (int i = 0; i < freq; i++) { | |
int position = docPosEnum.nextPosition(); | |
positions[i]=position; | |
} | |
puts("in doc " + docid + " the term " + term.utf8ToString() + " appears " + freq + " times at positions " + ppArray(positions)); | |
} | |
} | |
indexReader.close(); | |
} | |
private String ppArray(int[] arr) { | |
StringBuilder sb = new StringBuilder(); | |
for (int i = 0; i < arr.length; i++) { | |
sb.append(arr[i]); | |
if (i + 1 < arr.length) sb.append(", "); | |
} | |
return sb.toString(); | |
} | |
private void puts(Object msg) { | |
System.out.println(msg); | |
} | |
private void index() throws IOException { | |
IndexWriter indexWriter = new IndexWriter(indexDir, | |
new IndexWriterConfig(VERSION, new WhitespaceAnalyzer(VERSION))); | |
addDocs(indexWriter); | |
indexWriter.commit(); | |
indexWriter.close(); | |
} | |
public static void main(String[] args) throws Exception { | |
CountingTerms ct = new CountingTerms(); | |
ct.index(); | |
ct.countTerms(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment