/*
 * This Kotlin source file was generated by the Gradle 'init' task.
 */
package lucene.napkin

import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.*
import org.apache.lucene.index.*
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser
import org.apache.lucene.search.*
import org.apache.lucene.store.Directory
import org.apache.lucene.store.MMapDirectory
import java.nio.file.Files
import java.nio.file.Path
import kotlin.random.Random
import kotlin.time.ExperimentalTime
import kotlin.time.TimeSource
import kotlin.time.measureTime

// https://storage.googleapis.com/pair-programming/search/products_big.json
@ExperimentalTime
fun createDocuments(writer: IndexWriter, number: Int) {
    val clock = TimeSource.Monotonic
    val mark = clock.markNow()
    // Field instances are built once up front and reused across documents, which Lucene recommends for indexing speed.
    val names = (0..256).map { n -> StringField("name", "name$n", Field.Store.YES) }
    val brands = (0..64).map { n -> StringField("brand", "brand$n", Field.Store.YES) }
    val attributes = (0..256).map { n -> StringField("attribute", "attribute$n", Field.Store.YES) }
    (0 until number).forEach { index ->
        if (index % 10_000 == 0) {
            println("Inserted $index documents")
        }
        val doc = Document()
        doc.add(names[Random.nextInt(names.size)])
        doc.add(brands[Random.nextInt(brands.size)])
        (0..32).forEach { _ -> doc.add(attributes[Random.nextInt(attributes.size)]) }
        val price = Random.nextLong(0, 1000)
        doc.add(LongPoint("price_point", price)) // range queries
        doc.add(SortedNumericDocValuesField("price_doc", price)) // sorting
        val inventoryQuantity = Random.nextLong(0, 1000)
        doc.add(LongPoint("inventory_quantity_point", inventoryQuantity)) // range queries
        doc.add(SortedNumericDocValuesField("inventory_quantity_doc", inventoryQuantity)) // sorting
        writer.addDocument(doc)
    }
    writer.commit()
    val elapsed = mark.elapsedNow()
println("Inserted $number of docs ($elapsed)") | |
} | |

@ExperimentalTime
fun query(searcher: IndexSearcher, query: Query): Array<ScoreDoc> {
    val clock = TimeSource.Monotonic
    var hits: Array<ScoreDoc>
    val totalCount = searcher.count(query)
    // Sorting in reverse is still slow, which means we need an index per...
    val sort = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG, false))
    val searchDuration = clock.measureTime {
        hits = searcher.search(ConstantScoreQuery(query), 100, sort).scoreDocs
    }
    println("query: `${query}`, hits: ${hits.size}, count: $totalCount (${searchDuration})")
    return hits
}

@ExperimentalTime
fun main() {
    val analyzer: Analyzer = StandardAnalyzer()
    val path = Path.of("by_price")
    println("Kotlin Version : ${KotlinVersion.CURRENT}")
    println("Java VM Version : ${System.getProperty("java.vm.version")}")
    var directory: Directory
    if (Files.exists(path)) {
        directory = MMapDirectory.open(path)
    } else {
        Files.createDirectory(path)
        directory = MMapDirectory.open(path)
        val config = IndexWriterConfig(analyzer)
        config.useCompoundFile = false
        // Another possible approach is to encode the sort value as a payload or norm. With block-max WAND
        // and the native scoring mechanisms, this might be plenty fast. Either the native scoring can be used, or we
        // can write a collector/scorer that takes that payload/norm into account and just picks the maximum. This
        // would avoid a lot of random memory access, since the value would be encoded alongside the fields.
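        // A rough, commented-out sketch of that idea using Lucene's FeatureField, which stores a per-document
        // value in the term's norm so the regular scorer (with block-max/impacts) can skip non-competitive
        // blocks. The "features"/"price" names are made up for illustration, and FeatureField values must be
        // positive, hence the +1. Left commented out so the benchmark below is unchanged.
        //
        //   // in createDocuments():
        //   doc.add(FeatureField("features", "price", (price + 1).toFloat()))
        //   // at query time, rank by the encoded price instead of sorting on doc values:
        //   val byPrice = FeatureField.newSaturationQuery("features", "price", 1f, 500f)
        //   val top = searcher.search(byPrice, 100)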
        // For deep pagination, we could just not allow certain sorting options past a certain limit, since only
        // bots and scrapers will do this, not humans. Sorting doesn't matter to humans if you're on page 100, but it
        // does matter to machines.
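        // If deeper pages did need to be served, searchAfter is the usual way to page without re-collecting
        // earlier hits. A commented-out sketch, e.g. in query(); `lastHit` is hypothetical and would be the
        // final FieldDoc of the previous page:
        //
        //   val nextPage = searcher.searchAfter(lastHit, query, 100, sort)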
        config.indexSort = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG))
        //config.mergePolicy = NoMergePolicy.INSTANCE // more segments = more concurrency
        //config.ramBufferSizeMB = 4000.0
        val writer: IndexWriter = IndexWriter(directory, config)
        createDocuments(writer, 10_000_000)
        // Close the writer to release the write lock; otherwise CheckIndex below can't obtain it.
        writer.close()
    }
    val reader: DirectoryReader = DirectoryReader.open(directory)
    //val searcher: IndexSearcher = IndexSearcher(reader, ScheduledThreadPoolExecutor(8))
    val searcher = IndexSearcher(reader)
    searcher.queryCache = null
    val parser = StandardQueryParser(analyzer)
    // Curiously, there's a cliff after `attribute3`. Presumably this is due to cache sizing?
    val textQuery = parser.parse("attribute:attribute1 AND attribute:attribute2 AND attribute:attribute3", "name")
    val booleanQuery = BooleanQuery.Builder()
    booleanQuery.add(textQuery, BooleanClause.Occur.MUST)
    // The query parser doesn't handle point queries well; contribution opportunity :)
    // Sometimes it's faster to use doc values, sometimes it's faster to use points.
    // The `IndexOrDocValuesQuery` will choose the appropriate one.
    // https://www.elastic.co/blog/better-query-planning-for-range-queries-in-elasticsearch
    // https://lucene.apache.org/core/8_6_3/core/index.html?overview-summary.html
    val lowerValue: Long = 100
    val upperValue: Long = 500
    // For the napkin math blog post, showing why this matters would be great.
    val pointQuery = LongPoint.newRangeQuery("price_point", lowerValue, upperValue)
    val dvQuery = SortedNumericDocValuesField.newSlowRangeQuery("price_doc", lowerValue, upperValue)
    // booleanQuery.add(IndexOrDocValuesQuery(pointQuery, dvQuery), BooleanClause.Occur.MUST)
    val query = booleanQuery.build()
    var hits: Array<ScoreDoc> = emptyArray()
    (1..10).forEach { _ ->
        hits = query(searcher, query)
    }
    println("Document ids: ${hits.map { it.doc }.joinToString()}")
    val status = CheckIndex(directory).checkIndex()
    //val postingStatus = CheckIndex(directory).
    reader.close()
    directory.close()
}