sirupsen/napkin_problem_13.kt Secret

## napkin_problem_13.kt
/*
 * This Kotlin source file was generated by the Gradle 'init' task.
 */
package lucene.napkin

import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.*
import org.apache.lucene.index.*
import org.apache.lucene.queries.intervals.Intervals.term
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser
import org.apache.lucene.search.*
import org.apache.lucene.store.Directory
import org.apache.lucene.store.MMapDirectory
import java.nio.file.Files
import java.nio.file.Path
import kotlin.random.Random
import kotlin.time.ExperimentalTime
import kotlin.time.TimeSource
import kotlin.time.measureTime


// https://storage.googleapis.com/pair-programming/search/products_big.json

/**
 * Adds [number] randomly generated product documents to [writer] and commits them.
 *
 * Each document carries one `name`, one `brand`, 33 randomly drawn `attribute` values (the same
 * value may be drawn more than once), and a random `price` and `inventory_quantity` in `[0, 1000)`.
 * Both numbers are indexed twice: as a point field and as a doc-values field.
 *
 * Progress is printed every 10,000 documents, and the elapsed time is printed at the end.
 *
 * @param writer index writer receiving the documents; it is committed here but not closed.
 * @param number exact number of documents to add.
 */
@ExperimentalTime
fun createDocuments(writer: IndexWriter, number: Int) {
    val mark = TimeSource.Monotonic.markNow()

    // The ranges are inclusive: 257 names, 65 brands and 257 attributes. The field instances
    // are created once and shared by every document that draws them.
    val names = (0..256).map { n -> StringField("name", "name$n", Field.Store.YES) }
    val brands = (0..64).map { n -> StringField("brand", "brand$n", Field.Store.YES) }
    val attributes = (0..256).map { n -> StringField("attribute", "attribute$n", Field.Store.YES) }

    // `repeat` runs exactly `number` times; the previous inclusive `0..number` range wrote one
    // document more than the summary line below reports.
    repeat(number) { index ->
        if (index % 10_000 == 0) {
            println("Inserted $index documents")
        }
        val doc = Document()

        doc.add(names[Random.nextInt(names.size)])
        doc.add(brands[Random.nextInt(brands.size)])
        // 33 draws per document (inclusive 0..32).
        (0..32).forEach { _ -> doc.add(attributes[Random.nextInt(attributes.size)]) }

        val price = Random.nextLong(0, 1000)
        doc.add(LongPoint("price_point", price)) // range queries
        doc.add(SortedNumericDocValuesField("price_doc", price)) // sorting

        val inventoryQuantity = Random.nextLong(0, 1000)
        doc.add(LongPoint("inventory_quantity_point", inventoryQuantity)) // range queries
        doc.add(SortedNumericDocValuesField("inventory_quantity_doc", inventoryQuantity)) // sorting

        writer.addDocument(doc)
    }

    writer.commit()
    val elapsed = mark.elapsedNow()
    println("Inserted $number of docs ($elapsed)")
}

/**
 * Counts the matches of [query], then fetches the top 100 hits sorted by the `price_doc` field
 * and prints the query, hit count, total match count and the time the search call took.
 *
 * Only the sorted search is timed; the count runs before the clock starts.
 *
 * @param searcher searcher to run the query against.
 * @param query query to execute; it is wrapped in a [ConstantScoreQuery] for the search call.
 * @return the score docs of the sorted top-100 search.
 */
@ExperimentalTime
fun query(searcher: IndexSearcher, query: Query): Array<ScoreDoc> {
    val matchCount = searcher.count(query)

    // Sorting in reverse is still slow, which means we need an index per...
    val byPrice = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG, false))

    val started = TimeSource.Monotonic.markNow()
    val topHits = searcher.search(ConstantScoreQuery(query), 100, byPrice).scoreDocs
    val searchDuration = started.elapsedNow()

    println("query: `${query}`, hits: ${topHits.size}, count: $matchCount (${searchDuration})")
    return topHits
}

/**
 * Benchmark entry point.
 *
 * Builds the price-sorted index under `by_price` when that path does not exist yet, runs the same
 * three-attribute query ten times (timings are printed by [query]), prints the document ids of the
 * last run, and finally runs [CheckIndex] over the index and prints whether it is clean.
 *
 * The directory, writer, reader and checker are scoped with `use`, so each is closed even when a
 * later step throws.
 */
@ExperimentalTime
fun main() {
    val analyzer: Analyzer = StandardAnalyzer()
    val path = Path.of("by_price")
    println("Kotlin Version  : ${KotlinVersion.CURRENT} ")
    println("Java VM Version  : ${System.getProperty("java.vm.version")} ")

    // Decide before opening the directory: the index is only built when the path is missing.
    val needsIndex = !Files.exists(path)
    if (needsIndex) {
        Files.createDirectory(path)
    }

    MMapDirectory.open(path).use { directory ->
        if (needsIndex) {
            val config = IndexWriterConfig(analyzer)
            config.useCompoundFile = false
            // Another possible sort value would be to use a payload or norm. Using the BW-max (or whatever it's called)
            // and the native scoring mechanisms, this might be plenty fast. Either the native scoring can be used, or, we
            // can write a collector/scorer that takes that payload/norm into account and just picks up the maximum. This
            // would mean we'd avoid a lot of random memory access as it'd be encoded with the fields.
            // For deep pagination, we could just not allow certain sorting options past a certain limit, since only
            // bots and scrapers will do this, not humans. Sorting doesn't matter to humans if you're on page 100, but it
            // does matter to machines.
            config.indexSort = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG))
            //config.mergePolicy = NoMergePolicy.INSTANCE // more segments = more concurrency
            //config.ramBufferSizeMB = 4000.0

            // Close the writer as soon as indexing is done: an open IndexWriter keeps the
            // directory's write lock, which CheckIndex below has to obtain.
            IndexWriter(directory, config).use { writer ->
                createDocuments(writer, 10_000_000)
            }
        }

        DirectoryReader.open(directory).use { reader ->
            //val searcher: IndexSearcher = IndexSearcher(reader, ScheduledThreadPoolExecutor(8))
            val searcher = IndexSearcher(reader)
            // Query caching is switched off so the repeated runs below are not served from the cache.
            searcher.queryCache = null

            val parser = StandardQueryParser(analyzer)
            // Curiously, there's a cliff after `attribute3`. Presumably this is due to cache sizing?
            val textQuery = parser.parse("attribute:attribute1 AND attribute:attribute2 AND attribute:attribute3", "name")
            val booleanQuery = BooleanQuery.Builder()
            booleanQuery.add(textQuery, BooleanClause.Occur.MUST)
            // Query parser doesn't handle point queries well. contribution opportunity :)
            // Sometimes it's faster to use doc values, sometimes it's faster to use points.
            // The `IndexOrDocValuesQuery` will choose the appropriate one..
            // https://www.elastic.co/blog/better-query-planning-for-range-queries-in-elasticsearch
            // https://lucene.apache.org/core/8_6_3/core/index.html?overview-summary.html
            val lowerValue: Long = 100
            val upperValue: Long = 500
            // For the napkin math blog post, showing why this matters would be great.
            // Both range queries are only referenced by the disabled clause below; they are kept
            // so the clause can be re-enabled by uncommenting a single line.
            val pointQuery = LongPoint.newRangeQuery("price_point", lowerValue, upperValue)
            val dvQuery = SortedNumericDocValuesField.newSlowRangeQuery("price_doc", lowerValue, upperValue)
            // booleanQuery.add(IndexOrDocValuesQuery(pointQuery, dvQuery), BooleanClause.Occur.MUST)

            val builtQuery = booleanQuery.build()
            // Run the identical query ten times; only the hits of the last run are printed.
            var hits: Array<ScoreDoc> = emptyArray()
            repeat(10) {
                hits = query(searcher, builtQuery)
            }

            println("Document ids: ${hits.map { it.doc }.joinToString()}")
        }

        // CheckIndex takes the write lock for itself, so it is closed again right after the check.
        CheckIndex(directory).use { checker ->
            val status = checker.checkIndex()
            println("Index clean: ${status.clean}")
        }
        //val postingStatus = CheckIndex(directory).
    }
}
	/*
	* This Kotlin source file was generated by the Gradle 'init' task.
	*/
	package lucene.napkin

	import org.apache.lucene.analysis.Analyzer
	import org.apache.lucene.analysis.standard.StandardAnalyzer
	import org.apache.lucene.document.*
	import org.apache.lucene.index.*
	import org.apache.lucene.queries.intervals.Intervals.term
	import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser
	import org.apache.lucene.search.*
	import org.apache.lucene.store.Directory
	import org.apache.lucene.store.MMapDirectory
	import java.nio.file.Files
	import java.nio.file.Path
	import kotlin.random.Random
	import kotlin.time.ExperimentalTime
	import kotlin.time.TimeSource
	import kotlin.time.measureTime


	// https://storage.googleapis.com/pair-programming/search/products_big.json

	/**
	 * Adds [number] randomly generated product documents to [writer] and commits them.
	 *
	 * Each document carries one `name`, one `brand`, 33 randomly drawn `attribute` values (the same
	 * value may be drawn more than once), and a random `price` and `inventory_quantity` in `[0, 1000)`.
	 * Both numbers are indexed twice: as a point field and as a doc-values field.
	 *
	 * Progress is printed every 10,000 documents, and the elapsed time is printed at the end.
	 *
	 * @param writer index writer receiving the documents; it is committed here but not closed.
	 * @param number exact number of documents to add.
	 */
	@ExperimentalTime
	fun createDocuments(writer: IndexWriter, number: Int) {
	    val mark = TimeSource.Monotonic.markNow()

	    // The ranges are inclusive: 257 names, 65 brands and 257 attributes. The field instances
	    // are created once and shared by every document that draws them.
	    val names = (0..256).map { n -> StringField("name", "name$n", Field.Store.YES) }
	    val brands = (0..64).map { n -> StringField("brand", "brand$n", Field.Store.YES) }
	    val attributes = (0..256).map { n -> StringField("attribute", "attribute$n", Field.Store.YES) }

	    // `repeat` runs exactly `number` times; the previous inclusive `0..number` range wrote one
	    // document more than the summary line below reports.
	    repeat(number) { index ->
	        if (index % 10_000 == 0) {
	            println("Inserted $index documents")
	        }
	        val doc = Document()

	        doc.add(names[Random.nextInt(names.size)])
	        doc.add(brands[Random.nextInt(brands.size)])
	        // 33 draws per document (inclusive 0..32).
	        (0..32).forEach { _ -> doc.add(attributes[Random.nextInt(attributes.size)]) }

	        val price = Random.nextLong(0, 1000)
	        doc.add(LongPoint("price_point", price)) // range queries
	        doc.add(SortedNumericDocValuesField("price_doc", price)) // sorting

	        val inventoryQuantity = Random.nextLong(0, 1000)
	        doc.add(LongPoint("inventory_quantity_point", inventoryQuantity)) // range queries
	        doc.add(SortedNumericDocValuesField("inventory_quantity_doc", inventoryQuantity)) // sorting

	        writer.addDocument(doc)
	    }

	    writer.commit()
	    val elapsed = mark.elapsedNow()
	    println("Inserted $number of docs ($elapsed)")
	}

	/**
	 * Counts the matches of [query], then fetches the top 100 hits sorted by the `price_doc` field
	 * and prints the query, hit count, total match count and the time the search call took.
	 *
	 * Only the sorted search is timed; the count runs before the clock starts.
	 *
	 * @param searcher searcher to run the query against.
	 * @param query query to execute; it is wrapped in a [ConstantScoreQuery] for the search call.
	 * @return the score docs of the sorted top-100 search.
	 */
	@ExperimentalTime
	fun query(searcher: IndexSearcher, query: Query): Array<ScoreDoc> {
	    val matchCount = searcher.count(query)

	    // Sorting in reverse is still slow, which means we need an index per...
	    val byPrice = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG, false))

	    val started = TimeSource.Monotonic.markNow()
	    val topHits = searcher.search(ConstantScoreQuery(query), 100, byPrice).scoreDocs
	    val searchDuration = started.elapsedNow()

	    println("query: `${query}`, hits: ${topHits.size}, count: $matchCount (${searchDuration})")
	    return topHits
	}

	/**
	 * Benchmark entry point.
	 *
	 * Builds the price-sorted index under `by_price` when that path does not exist yet, runs the same
	 * three-attribute query ten times (timings are printed by [query]), prints the document ids of the
	 * last run, and finally runs [CheckIndex] over the index and prints whether it is clean.
	 *
	 * The directory, writer, reader and checker are scoped with `use`, so each is closed even when a
	 * later step throws.
	 */
	@ExperimentalTime
	fun main() {
	    val analyzer: Analyzer = StandardAnalyzer()
	    val path = Path.of("by_price")
	    println("Kotlin Version : ${KotlinVersion.CURRENT} ")
	    println("Java VM Version : ${System.getProperty("java.vm.version")} ")

	    // Decide before opening the directory: the index is only built when the path is missing.
	    val needsIndex = !Files.exists(path)
	    if (needsIndex) {
	        Files.createDirectory(path)
	    }

	    MMapDirectory.open(path).use { directory ->
	        if (needsIndex) {
	            val config = IndexWriterConfig(analyzer)
	            config.useCompoundFile = false
	            // Another possible sort value would be to use a payload or norm. Using the BW-max (or whatever it's called)
	            // and the native scoring mechanisms, this might be plenty fast. Either the native scoring can be used, or, we
	            // can write a collector/scorer that takes that payload/norm into account and just picks up the maximum. This
	            // would mean we'd avoid a lot of random memory access as it'd be encoded with the fields.
	            // For deep pagination, we could just not allow certain sorting options past a certain limit, since only
	            // bots and scrapers will do this, not humans. Sorting doesn't matter to humans if you're on page 100, but it
	            // does matter to machines.
	            config.indexSort = Sort(SortedNumericSortField("price_doc", SortField.Type.LONG))
	            //config.mergePolicy = NoMergePolicy.INSTANCE // more segments = more concurrency
	            //config.ramBufferSizeMB = 4000.0

	            // Close the writer as soon as indexing is done: an open IndexWriter keeps the
	            // directory's write lock, which CheckIndex below has to obtain.
	            IndexWriter(directory, config).use { writer ->
	                createDocuments(writer, 10_000_000)
	            }
	        }

	        DirectoryReader.open(directory).use { reader ->
	            //val searcher: IndexSearcher = IndexSearcher(reader, ScheduledThreadPoolExecutor(8))
	            val searcher = IndexSearcher(reader)
	            // Query caching is switched off so the repeated runs below are not served from the cache.
	            searcher.queryCache = null

	            val parser = StandardQueryParser(analyzer)
	            // Curiously, there's a cliff after `attribute3`. Presumably this is due to cache sizing?
	            val textQuery = parser.parse("attribute:attribute1 AND attribute:attribute2 AND attribute:attribute3", "name")
	            val booleanQuery = BooleanQuery.Builder()
	            booleanQuery.add(textQuery, BooleanClause.Occur.MUST)
	            // Query parser doesn't handle point queries well. contribution opportunity :)
	            // Sometimes it's faster to use doc values, sometimes it's faster to use points.
	            // The `IndexOrDocValuesQuery` will choose the appropriate one..
	            // https://www.elastic.co/blog/better-query-planning-for-range-queries-in-elasticsearch
	            // https://lucene.apache.org/core/8_6_3/core/index.html?overview-summary.html
	            val lowerValue: Long = 100
	            val upperValue: Long = 500
	            // For the napkin math blog post, showing why this matters would be great.
	            // Both range queries are only referenced by the disabled clause below; they are kept
	            // so the clause can be re-enabled by uncommenting a single line.
	            val pointQuery = LongPoint.newRangeQuery("price_point", lowerValue, upperValue)
	            val dvQuery = SortedNumericDocValuesField.newSlowRangeQuery("price_doc", lowerValue, upperValue)
	            // booleanQuery.add(IndexOrDocValuesQuery(pointQuery, dvQuery), BooleanClause.Occur.MUST)

	            val builtQuery = booleanQuery.build()
	            // Run the identical query ten times; only the hits of the last run are printed.
	            var hits: Array<ScoreDoc> = emptyArray()
	            repeat(10) {
	                hits = query(searcher, builtQuery)
	            }

	            println("Document ids: ${hits.map { it.doc }.joinToString()}")
	        }

	        // CheckIndex takes the write lock for itself, so it is closed again right after the check.
	        CheckIndex(directory).use { checker ->
	            val status = checker.checkIndex()
	            println("Index clean: ${status.clean}")
	        }
	        //val postingStatus = CheckIndex(directory).
	    }
	}