Skip to content

Instantly share code, notes, and snippets.

@gilinachum
Created June 14, 2015 12:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gilinachum/a297b743b3b05bb69888 to your computer and use it in GitHub Desktop.
Save gilinachum/a297b743b3b05bb69888 to your computer and use it in GitHub Desktop.
Cost of fields in Lucene index
import java.io.File;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.util.Random;
import java.util.UUID;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
/**
 * Manual benchmark for the cost of fields in a Lucene index.
 *
 * <p>Depending on {@link #mode}, every document gets ten fields whose names are
 * either drawn from a fixed set of ten names ({@code fewFields}) or picked at
 * random from one million generated names ({@code manyFields}). The program
 * prints the used heap at several points so the two modes can be compared by
 * running the program once per mode.
 *
 * <p>The index is created on the first run and reused on later runs.
 */
public class FieldsIndexingMemTest {
    private static IndexReader ireader;
    private static IndexSearcher isearcher;
    private static FSDirectory directory;
    private static Analyzer analyzer;
    private static QueryParser parser;
    private static IndexWriter iwriter;
    private static Random random = new Random();

    /** Which field-naming strategy the generated documents use. */
    private enum Mode {
        fewFields, manyFields
    };

    // Edit this assignment to Mode.manyFields to benchmark the other strategy.
    private static Mode mode = Mode.fewFields;

    /**
     * Runs the benchmark: opens (or builds) the index, runs one dummy query,
     * waits for the user to hit enter, then closes everything.
     *
     * @param args optional; if {@code args[0]} is present it is used as the base
     *             directory under which a sub-folder named after the mode holds
     *             the index. Without arguments the original hard-coded location
     *             {@code C:\temp\index\<mode>} is used.
     * @throws Exception if indexing, query parsing or searching fails
     */
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        System.out.println(ManagementFactory.getRuntimeMXBean().getName());
        System.out.println("mode=" + mode);
        long before = System.currentTimeMillis();
        printOutMemory("Before starting");

        File indexFolder = (args != null && args.length > 0)
                ? new File(args[0], mode.toString())
                : new File("C:\\temp\\index\\" + mode.toString());
        directory = FSDirectory.open(indexFolder);
        try {
            analyzer = new WhitespaceAnalyzer();
            parser = new QueryParser(Version.LUCENE_CURRENT, "content", analyzer);

            // Ask Lucene whether an index is present instead of testing for the
            // folder: a folder that exists but holds no index would make the
            // create=false writer below fail.
            if (IndexReader.indexExists(directory)) {
                openWriterOverExistingIndex();
            } else {
                createNewIndex();
            }

            printOutMemory("Before opening reader+searcher+running dummy query");
            ireader = IndexReader.open(directory);
            isearcher = new IndexSearcher(ireader);
            Query query = parser.parse("name:a*");
            System.out.println(query.rewrite(ireader));
            // NOTE(review): hits is never read; presumably it is kept in a local so
            // the result array is still referenced for the next memory reading.
            ScoreDoc[] hits = isearcher.search(query, null, 1000000).scoreDocs;
            printOutMemory("After opening reader+searcher+running dummy query");

            printOutMemory("Before closing Lucene objects");
            System.out.println("Hit enter key to continue...");
            System.in.read();
        } finally {
            // Always release the handles (and the writer's lock), even when a
            // step above throws.
            closeLuceneObjects();
        }

        System.out.println();
        System.out.println("Done. Runtime duration=" + (System.currentTimeMillis() - before) + "ms");
        printOutMemory("After closing Lucene objects");
    }

    /**
     * Closes searcher, reader, writer and directory in that order. Each close is
     * attempted even if an earlier one throws; handles that were never opened
     * are skipped.
     */
    private static void closeLuceneObjects() throws IOException {
        try {
            if (isearcher != null) {
                isearcher.close();
            }
        } finally {
            try {
                if (ireader != null) {
                    ireader.close();
                }
            } finally {
                try {
                    if (iwriter != null) {
                        iwriter.close();
                    }
                } finally {
                    directory.close();
                }
            }
        }
    }

    /**
     * Prints the used heap (total minus free) in MB, both as sampled on entry
     * and again after requesting garbage collection.
     *
     * @param prefixMessage label printed in front of the figures
     */
    private static void printOutMemory(String prefixMessage) {
        Runtime runtime = Runtime.getRuntime();
        long beforeUsedMemory = runtime.totalMemory() - runtime.freeMemory();
        requestGc();
        try {
            // Pause between the two rounds of GC requests.
            Thread.sleep(500);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of dropping it.
            Thread.currentThread().interrupt();
        }
        requestGc();
        long afterUsedMemory = runtime.totalMemory() - runtime.freeMemory();
        System.out.println(prefixMessage + " - Used memory=" + (afterUsedMemory / 1024 / 1024) + "MB (beforeUsedMemory=" + beforeUsedMemory / 1024 / 1024
                + "MB)");
    }

    /** Requests garbage collection ten times in a row. */
    private static void requestGc() {
        for (int i = 0; i < 10; i++) {
            System.gc();
        }
    }

    /**
     * Opens {@link #iwriter} on the index already present in {@link #directory}
     * and prints how long that took.
     */
    private static void openWriterOverExistingIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        System.out.println("Opening existing index");
        long before = System.currentTimeMillis();
        iwriter = new IndexWriter(directory, analyzer, false, IndexWriter.MaxFieldLength.UNLIMITED);
        System.out.println("open existing index duration=" + (System.currentTimeMillis() - before) + "ms");
    }

    /**
     * Creates a fresh index in {@link #directory}, fills it with 100,000
     * generated documents and commits. Used memory is printed before and after
     * the commit.
     */
    private static void createNewIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        System.out.println("Creating index from scratch");
        iwriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

        int numUniqueTerms = 100 * 1000;
        String[] uniqueTerms = new String[numUniqueTerms];
        for (int i = 0; i < uniqueTerms.length; i++) {
            uniqueTerms[i] = String.valueOf(i);
        }

        int numUniqueFieldNames = 1 * 1000 * 1000;
        // The generated names are only read in manyFields mode, so skip building
        // one million UUID strings when they would never be used.
        String[] uniqueFieldNames = (mode == Mode.manyFields)
                ? generateUniqueFieldNames(numUniqueFieldNames)
                : new String[0];

        int numOfDocs = 100 * 1000;
        for (int i = 0; i < numOfDocs; i++) {
            addNewDocument(numUniqueTerms, uniqueTerms, numUniqueFieldNames, uniqueFieldNames);
            if (i % 1000 == 0) {
                System.out.println("Progress: " + (100 * i / numOfDocs) + "% (wrote " + i + " documents)");
            }
        }

        // Drop the generator arrays so they do not count in the readings below.
        uniqueTerms = null;
        uniqueFieldNames = null;
        printOutMemory("before commit()");
        iwriter.commit();
        printOutMemory("after commit()");
    }

    /** Builds {@code count} field names of the form {@code community_tag_<random UUID>}. */
    private static String[] generateUniqueFieldNames(int count) {
        String[] names = new String[count];
        for (int i = 0; i < names.length; i++) {
            names[i] = "community_tag_" + UUID.randomUUID().toString();
        }
        return names;
    }

    /**
     * Adds one document with ten stored, non-analyzed fields to the index.
     *
     * @param numUniqueTerms      number of entries of {@code uniqueTerms} to draw from
     * @param uniqueTerms         pool of terms used to build field values
     * @param numUniqueFieldNames number of entries of {@code uniqueFieldNames} to
     *                            draw from (only used in manyFields mode)
     * @param uniqueFieldNames    pool of field names (only read in manyFields mode)
     */
    private static void addNewDocument(int numUniqueTerms, String[] uniqueTerms, int numUniqueFieldNames, String[] uniqueFieldNames)
            throws CorruptIndexException, IOException {
        Document doc = new Document();
        for (int j = 0; j < 10; j++) {
            String fieldName = (mode == Mode.fewFields) ? ("community_tag_" + j) : uniqueFieldNames[random.nextInt(numUniqueFieldNames)];
            String fieldValue = getFieldValue(numUniqueTerms, uniqueTerms);
            doc.add(new Field(fieldName, fieldValue, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        }
        iwriter.addDocument(doc);
    }

    /**
     * Builds a field value from 0 to 9 randomly chosen terms, each followed by a
     * single space (so the result is empty when zero terms are drawn).
     *
     * @param numUniqueTerms number of entries of {@code uniqueTerms} to draw from
     * @param uniqueTerms    pool of terms
     * @return the space-separated terms, with a trailing space if non-empty
     */
    private static String getFieldValue(int numUniqueTerms, String[] uniqueTerms) {
        int termsInField = random.nextInt(10);
        StringBuilder sb = new StringBuilder();
        for (int w = 0; w < termsInField; w++) {
            sb.append(uniqueTerms[random.nextInt(numUniqueTerms)]).append(" ");
        }
        return sb.toString();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment