Skip to content

Instantly share code, notes, and snippets.

@tkovis
Created November 4, 2021 10:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tkovis/e1dfde9df1c20dc11e08f51d8a12d08e to your computer and use it in GitHub Desktop.
Save tkovis/e1dfde9df1c20dc11e08f51d8a12d08e to your computer and use it in GitHub Desktop.
xtdb aggregate querying
(ns repro
(:require
[xtdb.api :as xt]
[xtdb.query]))
(comment
'{com.xtdb/xtdb-core {:mvn/version "1.19.0-beta1"}
com.xtdb/xtdb-rocksdb {:mvn/version "1.19.0-beta1"}
com.xtdb/xtdb-lucene {:mvn/version "1.19.0-beta1"}})
(def node (xt/start-node
'{:xtdb/index-store {:kv-store
{:xtdb/module xtdb.rocksdb/->kv-store
:db-dir "repro/rocksdb/index"}
:checkpointer {:xtdb/module xtdb.checkpoint/->checkpointer
:store {:xtdb/module xtdb.checkpoint/->filesystem-checkpoint-store
:path "repro/xtdb-checkpoint"}
:approx-frequency "PT1H"}}
:xtdb/document-store {:kv-store
{:xtdb/module xtdb.rocksdb/->kv-store
:db-dir "repro/rocksdb/document"}}
:xtdb/tx-log {:kv-store
{:xtdb/module xtdb.rocksdb/->kv-store
:db-dir "repro/rocksdb/tx"}}
:xtdb.lucene/lucene-store {:db-dir "repro/lucene"
:indexer xtdb.lucene.multi-field/->indexer}}))
(defn ->uuid [_] (java.util.UUID/randomUUID))
(def parent-atom (atom []))
(def c1-atom (atom []))
(def c2-atom (atom []))
(def c3-atom (atom []))
(def c4-atom (atom []))
(defn c3->c4 [db eid]
(let [c3 (xt/entity db eid)]
(when (not (:c3/must-not-have c3))
(:c3/c4 c3))))
(defn c3->c4-v2 [db c3]
(when (not (ffirst (xt/q db `{:find [must-not-have]
:in [c3]
:where [[c3 :c3/must-not-have must-not-have]]}
c3)))
(ffirst (xt/q db `{:find [c4]
:in [c3]
:where [[c3 :c3/c4 c4]]}
c3))))
(defmethod xtdb.query/aggregate 'count-c4 [_]
(fn
([] 0)
([acc] acc)
([acc eid]
(let [c4 (xt/entity (xt/db node) eid)]
(if (and (:c4/must-have c4)
(not (:c4/must-not-have c4)))
(inc acc)
acc)))))
(comment
; explicit nils
(future
(doseq [parent (->> (range) (take 10) (map ->uuid))]
(swap! parent-atom conj [::xt/put {:xt/id parent
:parent/string (str (hash parent))}]))
(doseq [c4 (->> (range) (take 40000) (map ->uuid))]
(let [must-have (< (rand) 0.5)
must-not-have (and must-have (< (rand) 0.1))]
(swap! c4-atom conj [::xt/put {:xt/id c4
:c4/must-have (or must-have nil)
:c4/must-not-have (or must-not-have nil)
:c4/string (str (hash c4))}])))
(doseq [c2 (->> (range) (take 3000) (map ->uuid))]
(swap! c2-atom conj [::xt/put {:xt/id c2
:c2/string (str (hash c2))}]))
(let [parents @parent-atom
c2s @c2-atom]
(doseq [c1 (->> (range) (take 5000) (map ->uuid))]
(swap! c1-atom conj [::xt/put {:xt/id c1
:c1/parent (get-in (rand-nth parents) [1 :xt/id])
:c1/c2 (get-in (rand-nth c2s) [1 :xt/id])
:c1/string (str (hash c1))}])))
(let [c2s @c2-atom
c4s @c4-atom]
(doseq [c3 (->> (range) (take 60000) (map ->uuid))]
(swap! c3-atom conj [::xt/put {:xt/id c3
:c3/must-not-have (or (< (rand) 0.2) nil)
:c3/c2 (get-in (rand-nth c2s) [1 :xt/id])
:c3/c4 (get-in (rand-nth c4s) [1 :xt/id])
:c3/string (str (hash c3))}])))
(->> (concat @parent-atom @c1-atom @c2-atom @c3-atom @c4-atom)
(partition-all 500)
(map (fn [batch]
(xt/submit-tx node (vec batch))))
last
(xt/await-tx node))
(println "done"))
(xt/attribute-stats node)
; vanilla
(time
(dotimes [_ 10]
(time
(xt/q (xt/db node) '{:find [parent (count c4)]
:where [[c1 :c1/parent parent]
[c1 :c1/c2 c2]
[c3 :c3/c2 c2]
[c3 :c3/must-not-have nil]
[c3 :c3/c4 c4]
[c4 :c4/must-have must-have]
[(identity must-have)]
[c4 :c4/must-not-have nil]]
:order-by [[(count c4) :desc]]
:offset 0
:limit 10}))))
; ~ 4480ms / query
; custom aggregate
(time
(dotimes [_ 10]
(time
(xt/q (xt/db node) '{:find [parent (count-c4 c4)]
:where [[c1 :c1/parent parent]
[c1 :c1/c2 c2]
[c3 :c3/c2 c2]
[c3 :c3/must-not-have nil]
[c3 :c3/c4 c4]]
:order-by [[(count-c4 c4) :desc]]
:offset 0
:limit 10}))))
; ~ 8170ms / query
; custom aggregate + custom predicate
(time
(dotimes [_ 10]
(time
(xt/q (xt/db node) '{:find [parent (count-c4 c4)]
:where [[c1 :c1/parent parent]
[c1 :c1/c2 c2]
[c3 :c3/c2 c2]
[(repro/c3->c4 $ c3) c4]]
:offset 0
:limit 10}))))
; ~ 9540ms / query
(.close node) ; remove repro db dir and restart node
; implicit nils
(future
(->> (concat @parent-atom @c1-atom @c2-atom @c3-atom @c4-atom)
(map (fn [[tx doc]]
[tx (into {} (filter second doc))]))
(partition-all 500)
(map (fn [batch]
(xt/submit-tx node (vec batch))))
last
(xt/await-tx node))
(println "done"))
(xt/attribute-stats node)
; vanilla
(time
(dotimes [_ 10]
(time
(xt/q (xt/db node) '{:find [parent (count c4)]
:where [[c1 :c1/parent parent]
[c1 :c1/c2 c2]
[c3 :c3/c2 c2]
(not-join [c3] [c3 :c3/must-not-have])
[c3 :c3/c4 c4]
[c4 :c4/must-have must-have]
(not-join [c4] [c4 :c4/must-not-have])]
:order-by [[(count c4) :desc]]
:offset 0
:limit 10}))))
; ~ 4640ms / query
; custom aggregate
(time
(dotimes [_ 10]
(time
(xt/q (xt/db node) '{:find [parent (count-c4 c4)]
:where [[c1 :c1/parent parent]
[c1 :c1/c2 c2]
[c3 :c3/c2 c2]
(not-join [c3] [c3 :c3/must-not-have])
[c3 :c3/c4 c4]]
:order-by [[(count-c4 c4) :desc]]
:offset 0
:limit 10}))))
; ~ 11680ms / query
; custom predicate
(time
(dotimes [_ 10]
(time
(xt/q (xt/db node) '{:find [parent (count c4)]
:where [[c1 :c1/parent parent]
[c1 :c1/c2 c2]
[c3 :c3/c2 c2]
[(repro/c3->c4 $ c3) c4]
[c4 :c4/must-have must-have]
(not-join [c4] [c4 :c4/must-not-have])]
:order-by [[(count c4) :desc]]
:offset 0
:limit 10}))))
; timeout
; custom aggregate + custom predicate
(time
(dotimes [_ 10]
(time
(xt/q (xt/db node) '{:find [parent (count-c4 c4)]
:where [[c1 :c1/parent parent]
[c1 :c1/c2 c2]
[c3 :c3/c2 c2]
[(repro/c3->c4 $ c3) c4]]
:order-by [[(count-c4 c4) :desc]]
:offset 0
:limit 10}))))
; ~ 9650ms / query
; custom aggregate + custom predicate v2
(time
(dotimes [_ 10]
(time
(xt/q (xt/db node) '{:find [parent (count-c4 c4)]
:where [[c1 :c1/parent parent]
[c1 :c1/c2 c2]
[c3 :c3/c2 c2]
[(repro/c3->c4-v2 $ c3) c4]]
:order-by [[(count-c4 c4) :desc]]
:offset 0
:limit 10}))))
; ~ 24540ms / query
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment