import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;

public class TestIndexer {

    public static void main(String[] args) throws IOException {
        Directory dir = FSDirectory.open(Paths.get("/tmp/test/"));
        IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(getAnalyzer()));

        Document document = new Document();

        // Custom field type: term frequencies must be indexed (DOCS_AND_FREQS) so that
        // DelimitedTermFrequencyTokenFilter can record the per-term frequencies.
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        fieldType.setTokenized(true);
        fieldType.setStored(true);
        fieldType.setOmitNorms(true);
        fieldType.setStoreTermVectorOffsets(false);
        fieldType.setStoreTermVectorPositions(false);
        fieldType.freeze();

        // Each whitespace-separated token carries its term frequency after the '|' delimiter.
        Field text = new Field("text", "a|10 b|23 c|90", fieldType);
        document.add(text);
        indexWriter.addDocument(document);
        indexWriter.commit();

        IndexReader reader = DirectoryReader.open(indexWriter);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 100);
        System.out.println(searcher.collectionStatistics("text"));
        for (int i = 0; i < topDocs.totalHits.value; i++) {
            Document doc = searcher.doc(topDocs.scoreDocs[i].doc);
            IndexableField f = doc.getField("text");
            System.out.println(f.stringValue());
            System.out.println("SumTotalTermFreq: " + reader.getSumTotalTermFreq("text"));
            System.out.println("SumDocFreq: " + reader.getSumDocFreq("text"));
        }

        reader.close();
        indexWriter.close();
        dir.close();
    }

    private static Analyzer getAnalyzer() {
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                // Split on whitespace, then strip the "|<freq>" suffix from each token
                // and store it as the token's term frequency.
                Tokenizer tokenizer = new WhitespaceTokenizer();
                TokenFilter tokenFilter = new DelimitedTermFrequencyTokenFilter(tokenizer);
                TokenFilter stopFilter = new StopFilter(tokenFilter, CharArraySet.EMPTY_SET);
                return new TokenStreamComponents(tokenizer, stopFilter);
            }
        };
    }
}
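
A minimal sketch (not part of the original gist), assuming Lucene 8.x: the same analyzer chain can be inspected directly, without building an index, by consuming its TokenStream and reading each token's TermFrequencyAttribute, which DelimitedTermFrequencyTokenFilter populates from the "|<freq>" suffix. The class name TokenStreamDebug is hypothetical.

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;

public class TokenStreamDebug {
    public static void main(String[] args) throws Exception {
        // Same tokenizer/filter chain as TestIndexer, minus the (no-op) StopFilter.
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                return new TokenStreamComponents(tokenizer,
                        new DelimitedTermFrequencyTokenFilter(tokenizer));
            }
        };

        try (TokenStream ts = analyzer.tokenStream("text", "a|10 b|23 c|90")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            TermFrequencyAttribute freq = ts.addAttribute(TermFrequencyAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Expected: a -> 10, b -> 23, c -> 90
                System.out.println(term + " -> " + freq.getTermFrequency());
            }
            ts.end();
        }
    }
}

With the single sample document above, the statistics printed by TestIndexer should come out as SumTotalTermFreq = 123 (10 + 23 + 90) and SumDocFreq = 3, one document frequency per distinct term.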