Skip to content

Instantly share code, notes, and snippets.

@rainkinz
Last active December 17, 2015 17:19
Show Gist options
  • Save rainkinz/5645139 to your computer and use it in GitHub Desktop.
Save rainkinz/5645139 to your computer and use it in GitHub Desktop.
Probably a really bad example of counting terms and enumerating their positions in documents.
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.util.Random;
/**
 * Small Lucene demo: indexes a handful of documents made of randomly chosen
 * words into an in-memory directory, then walks the term dictionary of the
 * {@code text} field and prints, for every term, its total frequency and the
 * positions at which it occurs in each document.
 */
public class CountingTerms {
    private static final Version VERSION = Version.LUCENE_43;

    /** Name of the single field every document is indexed under. */
    private static final String FIELD_NAME = "text";

    /** Words the random documents are assembled from. */
    private static final String[] VOCABULARY =
            "hi am mary and i have a problem with lucene".split(" ");

    /** Number of random documents written by {@link #addDocs(IndexWriter)}. */
    private static final int NUM_DOCS = 10;

    private final Directory indexDir = new RAMDirectory();

    /** Shared generator, so a new one is not created for every document. */
    private final Random rand = new Random();

    /**
     * Builds a space-separated string of randomly picked vocabulary words.
     *
     * @return between 0 and {@code VOCABULARY.length - 1} words, each followed
     *         by a single space; may be the empty string
     */
    private String randomTerms() {
        StringBuilder sb = new StringBuilder();
        int numTerms = rand.nextInt(VOCABULARY.length);
        for (int i = 0; i < numTerms; i++) {
            sb.append(VOCABULARY[rand.nextInt(VOCABULARY.length)]).append(" ");
        }
        return sb.toString();
    }

    /**
     * Adds {@link #NUM_DOCS} documents with random content to the writer and
     * prints the content of each one.
     *
     * @param writer the open index writer to add the documents to
     * @throws IOException if the writer fails to add a document
     */
    private void addDocs(IndexWriter writer) throws IOException {
        for (int i = 0; i < NUM_DOCS; i++) {
            Document doc = new Document();
            String randomStr = randomTerms();
            puts("Adding random str: " + randomStr);
            IndexableField field = new TextField(FIELD_NAME, randomStr, Field.Store.YES);
            doc.add(field);
            writer.addDocument(doc);
        }
    }

    /**
     * Opens the index and prints every term of the {@code text} field together
     * with its total frequency and its per-document positions.
     *
     * <p>Only the first index segment is inspected, as in the original demo;
     * {@link #index()} commits all documents in one go, which is presumably
     * why a single segment is assumed here.
     *
     * @throws IOException if the index cannot be opened or read
     */
    private void countTerms() throws IOException {
        DirectoryReader indexReader = DirectoryReader.open(indexDir);
        try {
            if (indexReader.leaves().isEmpty()) {
                puts("The index has no segments; nothing to count");
                return;
            }
            AtomicReader reader = indexReader.leaves().get(0).reader();

            // Both the Fields instance and the per-field Terms may be null when
            // nothing was indexed, e.g. if every random document was empty.
            Fields fields = reader.fields();
            Terms textTerms = (fields == null) ? null : fields.terms(FIELD_NAME);
            if (textTerms == null) {
                puts("No terms were indexed for field '" + FIELD_NAME + "'");
                return;
            }

            TermsEnum termsEnum = textTerms.iterator(null);
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                String termText = term.utf8ToString();
                puts("---------------------------------------------------");
                puts("Term '" + termText + "' appears " + termsEnum.totalTermFreq()
                        + " times in the index");
                printPositions(reader, termsEnum, termText);
            }
        } finally {
            // Release the reader even when reading the postings fails.
            indexReader.close();
        }
    }

    /**
     * Prints, for the term the enum is currently positioned on, each document
     * containing it along with the term's frequency and positions there.
     *
     * @param reader    the segment reader supplying the live-docs filter
     * @param termsEnum the terms enum positioned on the term to report
     * @param termText  the term's text, used in the printed lines
     * @throws IOException if the postings cannot be read
     */
    private void printPositions(AtomicReader reader, TermsEnum termsEnum, String termText)
            throws IOException {
        DocsAndPositionsEnum docPosEnum = termsEnum.docsAndPositions(
                reader.getLiveDocs(), null, DocsAndPositionsEnum.FLAG_OFFSETS);
        if (docPosEnum == null) {
            // Lucene returns null when positions were not indexed for the field.
            return;
        }
        int docid;
        while ((docid = docPosEnum.nextDoc()) != DocsAndPositionsEnum.NO_MORE_DOCS) {
            int freq = docPosEnum.freq();
            int[] positions = new int[freq];
            for (int i = 0; i < freq; i++) {
                positions[i] = docPosEnum.nextPosition();
            }
            puts("in doc " + docid + " the term " + termText + " appears " + freq
                    + " times at positions " + ppArray(positions));
        }
    }

    /**
     * Formats an int array as a comma-separated list without brackets.
     *
     * @param arr the values to format
     * @return e.g. {@code "1, 4, 7"}; the empty string for an empty array
     */
    private String ppArray(int[] arr) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < arr.length; i++) {
            sb.append(arr[i]);
            if (i + 1 < arr.length) {
                sb.append(", ");
            }
        }
        return sb.toString();
    }

    /** Prints the message on its own line to standard output. */
    private void puts(Object msg) {
        System.out.println(msg);
    }

    /**
     * Creates the index in {@link #indexDir}, fills it with random documents
     * and commits it.
     *
     * @throws IOException if the index cannot be written
     */
    private void index() throws IOException {
        IndexWriter indexWriter = new IndexWriter(indexDir,
                new IndexWriterConfig(VERSION, new WhitespaceAnalyzer(VERSION)));
        try {
            addDocs(indexWriter);
            indexWriter.commit();
        } finally {
            // Always close so the directory's write lock is released.
            indexWriter.close();
        }
    }

    public static void main(String[] args) throws Exception {
        CountingTerms ct = new CountingTerms();
        ct.index();
        ct.countTerms();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment