/*
 * This Kotlin source file was generated by the Gradle 'init' task.
 */
package lucene.napkin

import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.*
import org.apache.lucene.index.*
import org.apache.lucene.queries.intervals.Intervals.term
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser
import org.apache.lucene.search.*
import org.apache.lucene.store.Directory
import org.apache.lucene.store.MMapDirectory
import java.nio.file.Files
import java.nio.file.Path
import kotlin.random.Random
import kotlin.time.ExperimentalTime
import kotlin.time.TimeSource
import kotlin.time.measureTime

// https://storage.googleapis.com/pair-programming/search/products_big.json
@ExperimentalTime
/**
 * Populates [writer] with [number] synthetic product documents (name, brand,
 * a bag of attributes, and price/inventory fields indexed both as points for
 * range queries and as doc values for sorting), then commits.
 *
 * Progress is printed every 10k docs and a total elapsed time at the end.
 */
fun createDocuments(writer: IndexWriter, number: Int) {
    val mark = TimeSource.Monotonic.markNow()
    // Field instances are pre-built and reused across documents; Lucene permits
    // this, and it avoids allocating millions of short-lived Field objects.
    // (Was `(0..256)` etc., which produced 257/65 values — one more than the
    // round counts clearly intended.)
    val names = List(256) { n -> StringField("name", "name$n", Field.Store.YES) }
    val brands = List(64) { n -> StringField("brand", "brand$n", Field.Store.YES) }
    val attributes = List(256) { n -> StringField("attribute", "attribute$n", Field.Store.YES) }
    // `repeat(number)` inserts exactly `number` docs; the original `(0..number)`
    // inserted number + 1, contradicting the summary line printed below.
    repeat(number) { index ->
        if (index % 10_000 == 0) {
            println("Inserted $index documents")
        }
        val doc = Document()
        doc.add(names[Random.nextInt(names.size)])
        doc.add(brands[Random.nextInt(brands.size)])
        // 32 randomly chosen attributes per doc (duplicates possible — Lucene
        // just indexes the value again). Was `(0..32)`, i.e. 33 picks.
        repeat(32) { doc.add(attributes[Random.nextInt(attributes.size)]) }
        val price = Random.nextLong(0, 1000)
        doc.add(LongPoint("price_point", price)) // range queries
        doc.add(SortedNumericDocValuesField("price_doc", price)) // sorting
        val inventoryQuantity = Random.nextLong(0, 1000)
        doc.add(LongPoint("inventory_quantity_point", inventoryQuantity)) // range queries
        doc.add(SortedNumericDocValuesField("inventory_quantity_doc", inventoryQuantity)) // sorting
        writer.addDocument(doc)
    }
    writer.commit()
    println("Inserted $number of docs (${mark.elapsedNow()})")
}
@ExperimentalTime
/**
 * Runs [query] against [searcher], returning the top-100 hits sorted by
 * ascending price. Prints the query, page size, full match count, and the
 * time the sorted search took.
 */
fun query(searcher: IndexSearcher, query: Query): Array<ScoreDoc> {
    // Count separately so the printed total covers the whole match set, not
    // just the 100-doc page fetched below.
    val totalCount = searcher.count(query)
    // Sorting in reverse is still slow, which means we need an index per...
    val byPrice = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG, false))
    val started = TimeSource.Monotonic.markNow()
    // ConstantScoreQuery skips relevance scoring; only the sort order matters here.
    val hits = searcher.search(ConstantScoreQuery(query), 100, byPrice).scoreDocs
    val searchDuration = started.elapsedNow()
    println("query: `${query}`, hits: ${hits.size}, count: $totalCount (${searchDuration})")
    return hits
}
@ExperimentalTime
/**
 * Opens (or builds, on first run) a 10M-doc index under ./by_price that is
 * index-sorted by price, then runs a conjunctive attribute query ten times,
 * printing timings, the final page of doc ids, and a CheckIndex report.
 */
fun main() {
    val analyzer: Analyzer = StandardAnalyzer()
    val path = Path.of("by_price")
    println("Kotlin Version : ${KotlinVersion.CURRENT} ")
    println("Java VM Version : ${System.getProperty("java.vm.version")} ")
    // Reuse an existing index directory; otherwise create and populate one.
    val directory: Directory = if (Files.exists(path)) {
        MMapDirectory.open(path)
    } else {
        Files.createDirectory(path)
        val dir = MMapDirectory.open(path)
        val config = IndexWriterConfig(analyzer)
        config.useCompoundFile = false
        // Another possible sort value would be to use a payload or norm. Using the BW-max (or whatever it's called)
        // and the native scoring mechanisms, this might be plenty fast. Either the native scoring can be used, or, we
        // can write a collector/scorer that takes that payload/norm into account and just picks up the maximum. This
        // would mean we'd avoid a lot of random memory access as it'd be encoded with the fields.
        // For deep pagination, we could just not allow certain sorting options past a certain limit, since only
        // bots and scrapers will do this, not humans. Sorting doesn't matter to humans if you're on page 100, but it
        // does matter to machines.
        config.indexSort = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG))
        //config.mergePolicy = NoMergePolicy.INSTANCE // more segments = more concurrency
        //config.ramBufferSizeMB = 4000.0
        // `use` closes the writer and releases write.lock — the original never
        // closed it, holding the lock for the life of the process.
        IndexWriter(dir, config).use { writer ->
            createDocuments(writer, 10_000_000)
        }
        dir
    }
    val reader: DirectoryReader = DirectoryReader.open(directory)
    //val searcher: IndexSearcher = IndexSearcher(reader, ScheduledThreadPoolExecutor(8))
    val searcher = IndexSearcher(reader)
    searcher.queryCache = null // measure raw query cost, not cache hits
    val parser = StandardQueryParser(analyzer)
    // Curiously, there's a cliff after `attribute3`. Presumably this is due to cache sizing?
    val textQuery = parser.parse("attribute:attribute1 AND attribute:attribute2 AND attribute:attribute3", "name")
    val booleanQuery = BooleanQuery.Builder()
    booleanQuery.add(textQuery, BooleanClause.Occur.MUST)
    // Query parser doesn't handle point queries well. contribution opportunity :)
    // Sometimes it's faster to use doc values, sometimes it's faster to use points.
    // The `IndexOrDocValuesQuery` will choose the appropriate one..
    // https://www.elastic.co/blog/better-query-planning-for-range-queries-in-elasticsearch
    // https://lucene.apache.org/core/8_6_3/core/index.html?overview-summary.html
    val lowerValue: Long = 100
    val upperValue: Long = 500
    // For the napkin math blog post, showing why this matters would be great.
    // These two feed the IndexOrDocValuesQuery experiment commented out below.
    val pointQuery = LongPoint.newRangeQuery("price_point", lowerValue, upperValue)
    val dvQuery = SortedNumericDocValuesField.newSlowRangeQuery("price_doc", lowerValue, upperValue)
    // booleanQuery.add(IndexOrDocValuesQuery(pointQuery, dvQuery), BooleanClause.Occur.MUST)
    val query = booleanQuery.build()
    // Run the query several times so JIT warm-up and page-cache effects show
    // in the printed timings; keep only the last result page.
    var hits: Array<ScoreDoc> = emptyArray()
    repeat(10) { hits = query(searcher, query) }
    println("Document ids: ${hits.map { it.doc }.joinToString()}")
    // CheckIndex takes the directory's write lock and is Closeable; close it
    // (the original leaked it, and its unused `status` local is dropped).
    CheckIndex(directory).use { checker -> checker.checkIndex() }
    //val postingStatus = CheckIndex(directory).
    reader.close()
    directory.close()
}