Skip to content

Instantly share code, notes, and snippets.

@d2a-raudenaerde
Last active January 23, 2021 07:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save d2a-raudenaerde/93a490e5b0d17b2fa88862473429aeb3 to your computer and use it in GitHub Desktop.
Save d2a-raudenaerde/93a490e5b0d17b2fa88862473429aeb3 to your computer and use it in GitHub Desktop.
lucene-docs-retrieval-bench
package org.audenaerde.lucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NRTCachingDirectory;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Random;
import java.util.concurrent.TimeUnit;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Benchmark)
@Fork(value = 2, jvmArgs = {"-Xms2G", "-Xmx2G"})
@Warmup(iterations = 2)
@Measurement(iterations = 4)
public class DocRetrievalBenchmark {
IndexSearcher searcher;
Random random;
public void createIndex(int version) throws IOException, NoSuchAlgorithmException
{
File f = new File("/tmp/bench" + version);
if (!f.exists()) {
Directory index = createDirectory(f);
IndexWriterConfig config = new IndexWriterConfig();
config.setIndexDeletionPolicy(new PersistentSnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy(), index));
IndexWriter writer = new IndexWriter(index, config);
String value2 = "Wikipedia is hosted by the Wikimedia Foundation, a non-profit organization that also hosts a range of other projects.";
String value3 = "Save your favorite articles to read offline, sync your reading lists across devices and customize your reading experience with the official Wikipedia app.";
MessageDigest digest = MessageDigest.getInstance("MD5");
for (int i = 0; i < 1_000_000; i++) {
digest.reset();
digest.update(Integer.toString(i).getBytes());
String value1 = new BigInteger(1, digest.digest()).toString(16);
writer.addDocument(createDocument(i, value1, value2, value3));
if (i % 10_000 == 0) {
System.out.println(i);
}
}
writer.commit();
index.close();
}
random = new Random(123);
searcher = new IndexSearcher(DirectoryReader.open(createDirectory(f)));
}
private static Document createDocument(int i, String value1, String value2, String value3)
{
Document document = new Document();
document.add(new StringField("field1", value1, Field.Store.YES));
document.add(new StringField("field2", value2 + i, Field.Store.YES));
document.add(new StringField("field3", value3 + i, Field.Store.YES));
document.add(new StringField("field4", String.valueOf(i % 1000), Field.Store.YES));
return document;
}
static Directory createDirectory(File file) throws IOException
{
return new NRTCachingDirectory(FSDirectory.open(file.toPath()), 1.0, 10.0);
}
public static void main(String[] args) throws RunnerException {
Options opt = new OptionsBuilder()
.include(DocRetrievalBenchmark.class.getSimpleName())
.forks(1)
.build();
new Runner(opt).run();
}
@Setup
public void setup() {
try {
createIndex(87);
} catch (IOException e) {
e.printStackTrace();
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
}
}
@Benchmark
public void retrieveDocuments(Blackhole bh) throws IOException {
//We find 1000 random docs. Assumed is that searching is a lot faster than retrieval.
TopDocs topDocs = searcher.search(new TermQuery(new Term("field4", String.valueOf(random.nextInt(1000)))), Integer.MAX_VALUE);
ScoreDoc[] docs = topDocs.scoreDocs;
for (ScoreDoc doc : docs)
{
bh.consume(searcher.doc(doc.doc));
}
}
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Lucene document-retrieval JMH benchmark. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>lucene-bench-docs</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
        <!-- Pin the source encoding so the build does not depend on the platform default. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <build>
        <plugins>
            <!-- Runs every JMH benchmark on the classpath during the integration-test phase. -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <!-- Explicit version: an unversioned plugin makes the build non-reproducible. -->
                <version>3.0.0</version>
                <executions>
                    <execution>
                        <id>run-benchmarks</id>
                        <phase>integration-test</phase>
                        <goals>
                            <goal>exec</goal>
                        </goals>
                        <configuration>
                            <classpathScope>test</classpathScope>
                            <executable>java</executable>
                            <arguments>
                                <argument>-classpath</argument>
                                <classpath/>
                                <argument>org.openjdk.jmh.Main</argument>
                                <!-- Benchmark include pattern: match everything. -->
                                <argument>.*</argument>
                            </arguments>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>8.7.0</version>
        </dependency>
        <dependency>
            <groupId>org.openjdk.jmh</groupId>
            <artifactId>jmh-core</artifactId>
            <version>1.21</version>
        </dependency>
        <!-- Annotation processor that generates the benchmark harness; only needed at compile time. -->
        <dependency>
            <groupId>org.openjdk.jmh</groupId>
            <artifactId>jmh-generator-annprocess</artifactId>
            <version>1.21</version>
            <scope>provided</scope>
        </dependency>
        <!-- Alternative Lucene version; presumably swapped in to compare results across releases. -->
        <!-- <dependency>-->
        <!-- <groupId>org.apache.lucene</groupId>-->
        <!-- <artifactId>lucene-core</artifactId>-->
        <!-- <version>7.5.0</version>-->
        <!-- </dependency>-->
    </dependencies>
</project>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment