Skip to content

Instantly share code, notes, and snippets.

@sirupsen
Last active December 7, 2021 22:43
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sirupsen/0c1d388d94d9de611c54df866e6d1708 to your computer and use it in GitHub Desktop.
Save sirupsen/0c1d388d94d9de611c54df866e6d1708 to your computer and use it in GitHub Desktop.
/*
* This Kotlin source file was generated by the Gradle 'init' task.
*/
package lucene.napkin
import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.*
import org.apache.lucene.index.*
import org.apache.lucene.queries.intervals.Intervals.term
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser
import org.apache.lucene.search.*
import org.apache.lucene.store.Directory
import org.apache.lucene.store.MMapDirectory
import java.nio.file.Files
import java.nio.file.Path
import kotlin.random.Random
import kotlin.time.ExperimentalTime
import kotlin.time.TimeSource
import kotlin.time.measureTime
// https://storage.googleapis.com/pair-programming/search/products_big.json
/**
 * Fills the index behind [writer] with [number] randomly generated product-like documents
 * and commits them.
 *
 * Every document gets one `name`, one `brand`, 33 `attribute` values (drawn with replacement,
 * so duplicates within a document are possible), plus a random `price` and
 * `inventory_quantity` in `[0, 1000)`. Each numeric value is indexed twice: as a point
 * (for range queries) and as doc values (for sorting).
 *
 * Progress is printed every 10,000 documents and the total elapsed time at the end.
 *
 * @param writer open index writer that receives the documents; it is committed but not closed here.
 * @param number exact number of documents to insert; values <= 0 insert nothing.
 */
@ExperimentalTime
fun createDocuments(writer: IndexWriter, number: Int) {
    val mark = TimeSource.Monotonic.markNow()

    // Pre-build the candidate field instances once and share them across documents.
    // The ranges are inclusive: 257 names, 65 brands and 257 attributes.
    val names = (0..256).map { n -> StringField("name", "name$n", Field.Store.YES) }
    val brands = (0..64).map { n -> StringField("brand", "brand$n", Field.Store.YES) }
    val attributes = (0..256).map { n -> StringField("attribute", "attribute$n", Field.Store.YES) }

    // `repeat(number)` runs for index 0 until number (exclusive), i.e. exactly `number` documents.
    // The previous `(0..number)` range was inclusive and inserted one document too many.
    repeat(number) { index ->
        if (index % 10_000 == 0) {
            println("Inserted $index documents")
        }

        val doc = Document()
        doc.add(names[Random.nextInt(names.size)])
        doc.add(brands[Random.nextInt(brands.size)])
        // 33 attribute picks per document (inclusive range 0..32).
        (0..32).forEach { _ -> doc.add(attributes[Random.nextInt(attributes.size)]) }

        val price = Random.nextLong(0, 1000)
        doc.add(LongPoint("price_point", price)) // range queries
        doc.add(SortedNumericDocValuesField("price_doc", price)) // sorting

        val inventoryQuantity = Random.nextLong(0, 1000)
        doc.add(LongPoint("inventory_quantity_point", inventoryQuantity)) // range queries
        doc.add(SortedNumericDocValuesField("inventory_quantity_doc", inventoryQuantity)) // sorting

        writer.addDocument(doc)
    }

    writer.commit()
    val elapsed = mark.elapsedNow()
    println("Inserted $number of docs ($elapsed)")
}
/**
 * Runs [query] against [searcher], prints a one-line timing report and returns the hits.
 *
 * The total match count is computed first and is not part of the timed section. The timed
 * section is a constant-score search for the top 100 documents sorted by `price_doc`.
 *
 * @param searcher searcher over the index to query.
 * @param query the query to execute.
 * @return the score docs of the (at most 100) top hits in sort order.
 */
@ExperimentalTime
fun query(searcher: IndexSearcher, query: Query): Array<ScoreDoc> {
    val matchCount = searcher.count(query)

    // Original author's note: sorting in reverse is still slow, which means we need an index per...
    // (third argument `false` = not reversed)
    val byPrice = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG, false))

    // Time only the search itself.
    val started = TimeSource.Monotonic.markNow()
    val topHits: Array<ScoreDoc> = searcher.search(ConstantScoreQuery(query), 100, byPrice).scoreDocs
    val searchDuration = started.elapsedNow()

    println("query: `${query}`, hits: ${topHits.size}, count: $matchCount (${searchDuration})")
    return topHits
}
/**
 * Entry point of the benchmark.
 *
 * Opens the index directory `by_price`, building a price-sorted index of 10,000,000 random
 * documents when no index exists there yet. It then runs the same three-attribute conjunction
 * query ten times (printing timings), prints the document ids of the last run, and finally
 * runs Lucene's `CheckIndex` over the directory.
 *
 * All Lucene resources (directory, writer, reader, index checker) are closed via `use`,
 * including when an exception is thrown.
 */
@ExperimentalTime
fun main() {
    val analyzer: Analyzer = StandardAnalyzer()
    val path = Path.of("by_price")
    println("Kotlin Version : ${KotlinVersion.CURRENT} ")
    println("Java VM Version : ${System.getProperty("java.vm.version")} ")

    // Unlike createDirectory, createDirectories does not fail when the directory is already there.
    Files.createDirectories(path)

    MMapDirectory.open(path).use { directory ->
        // Ask Lucene whether an index is present instead of merely checking that the folder
        // exists: an empty or partially built folder would otherwise make the reader fail below.
        if (!DirectoryReader.indexExists(directory)) {
            val config = IndexWriterConfig(analyzer)
            config.useCompoundFile = false
            // Another possible sort value would be to use a payload or norm. Using block-max
            // ("BW-max", or whatever it's called) and the native scoring mechanisms, this might be
            // plenty fast. Either the native scoring can be used, or we can write a
            // collector/scorer that takes that payload/norm into account and just picks up the
            // maximum. This would mean we'd avoid a lot of random memory access, as it'd be
            // encoded with the fields.
            // For deep pagination, we could just not allow certain sorting options past a certain
            // limit, since only bots and scrapers will do this, not humans. Sorting doesn't matter
            // to humans if you're on page 100, but it does matter to machines.
            config.indexSort = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG))
            //config.mergePolicy = NoMergePolicy.INSTANCE // more segments = more concurrency
            //config.ramBufferSizeMB = 4000.0

            // The writer must be closed before CheckIndex runs below: both take the directory's
            // write lock, so a still-open writer makes the check fail with a lock error.
            IndexWriter(directory, config).use { writer ->
                createDocuments(writer, 10_000_000)
            }
        }

        DirectoryReader.open(directory).use { reader ->
            //val searcher: IndexSearcher = IndexSearcher(reader, ScheduledThreadPoolExecutor(8))
            val searcher = IndexSearcher(reader)
            searcher.queryCache = null // no query cache, so every run does the full work

            val parser = StandardQueryParser(analyzer)
            // Curiously, there's a cliff after `attribute3`. Presumably this is due to cache sizing?
            val textQuery = parser.parse(
                "attribute:attribute1 AND attribute:attribute2 AND attribute:attribute3",
                "name",
            )
            val queryBuilder = BooleanQuery.Builder()
            queryBuilder.add(textQuery, BooleanClause.Occur.MUST)

            // Query parser doesn't handle point queries well. Contribution opportunity :)
            // Sometimes it's faster to use doc values, sometimes it's faster to use points.
            // The `IndexOrDocValuesQuery` will choose the appropriate one.
            // https://www.elastic.co/blog/better-query-planning-for-range-queries-in-elasticsearch
            // https://lucene.apache.org/core/8_6_3/core/index.html?overview-summary.html
            val lowerValue: Long = 100
            val upperValue: Long = 500
            // For the napkin math blog post, showing why this matters would be great.
            // Both range queries are only used by the disabled clause below; kept for experiments.
            val pointQuery = LongPoint.newRangeQuery("price_point", lowerValue, upperValue)
            val dvQuery = SortedNumericDocValuesField.newSlowRangeQuery("price_doc", lowerValue, upperValue)
            // queryBuilder.add(IndexOrDocValuesQuery(pointQuery, dvQuery), BooleanClause.Occur.MUST)

            // Named so it does not shadow the top-level `query` function called below.
            val benchmarkQuery = queryBuilder.build()

            // Run the identical query ten times; only the hits of the last run are reported.
            val hits: Array<ScoreDoc> = List(10) { query(searcher, benchmarkQuery) }.last()
            println("Document ids: ${hits.joinToString { it.doc.toString() }}")

            // CheckIndex holds the directory's write lock until closed, so close it as well.
            CheckIndex(directory).use { checker ->
                val status = checker.checkIndex()
                println("Index clean: ${status.clean}")
            }
            //val postingStatus = CheckIndex(directory).
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment