Skip to content

Instantly share code, notes, and snippets.

@genmeblog
Last active March 21, 2021 18:39
Show Gist options
  • Save genmeblog/18d6ed84224cbbef656adac4d85cc7ec to your computer and use it in GitHub Desktop.
Save genmeblog/18d6ed84224cbbef656adac4d85cc7ec to your computer and use it in GitHub Desktop.
Using TreeMap index to subselect datasets
(require '[clojure.set]
         '[tablecloth.api :as api])
(import '[java.util SortedMap TreeMap])
(defprotocol IndexProto
  "Operations for narrowing an index (a map from key to stored values)
  down to a subset of its keys."
  (slice-idx
    [idx ks]
    [idx from to]
    [idx from from-inclusive? to to-inclusive?]
    "Return the part of `idx` selected either by an explicit collection of
    keys `ks` or by a key range `from`..`to`. In the five-argument form the
    two flags say whether each bound belongs to the range.")
  (select-by-idx
    [idx ks]
    [idx from to]
    "Return the values stored in `idx` under the keys `ks`, or under the
    keys in the range `from`..`to`."))
;; TreeMap as an index.
;;
;; Every `slice-idx` arity returns a new, independent TreeMap that keeps the
;; ordering (comparator) of the source index, so a slice can itself be sliced
;; again and changes to a slice never reach the original index. Only the map
;; structure is copied; the stored values are shared with the source.
(extend-type TreeMap
  IndexProto
  (slice-idx
    ([idx ks]
     ;; Copy the index (the SortedMap constructor preserves the ordering),
     ;; then keep only the keys listed in `ks`. Keys in `ks` that the index
     ;; does not contain are simply ignored.
     (let [nidx (TreeMap. ^SortedMap idx)]
       (.retainAll (.keySet nidx) (set ks))
       nidx))
    ([idx from to]
     ;; A plain range is inclusive on both ends.
     (slice-idx idx from true to true))
    ([idx from from-inclusive? to to-inclusive?]
     ;; `.subMap` on its own returns a live view backed by `idx`; that view is
     ;; not a TreeMap, so this protocol would not apply to it, and writes to
     ;; it would modify `idx`. Copy the view into a real TreeMap instead.
     (TreeMap. ^SortedMap (.subMap ^TreeMap idx from from-inclusive? to to-inclusive?))))
  (select-by-idx
    ([idx from to]
     ;; Values of every key in the inclusive range, in key order.
     (mapcat identity (vals (slice-idx idx from to))))
    ([idx ks]
     ;; Values in the order the keys are given; a key that is not in the
     ;; index contributes nothing.
     (mapcat #(.get ^TreeMap idx %) ks))))
;; Build the index from `api/group-by` output. Keys are ordered with a Clojure
;; comparator (`compare` by default) so that `nil` and vector keys are allowed.
(defn build-index
  "Group `ds` by `grouping` and return a TreeMap from each group key to the
  row indexes of that group. Keys are ordered by `comparator`, which defaults
  to `compare`."
  ([ds grouping]
   (build-index ds grouping compare))
  ([ds grouping comparator]
   (let [groups (api/group-by ds grouping {:result-type :as-indexes})]
     (doto (TreeMap. comparator)
       (.putAll groups)))))
;; test dataset with missing values
(def ds
  (let [random-id #(when (< (rand) 0.8)
                     (rand-int 5))
        random-v  #(rand-nth [:a :b :c :d :e])]
    ;; 30 rows; :id is nil whenever the random draw is 0.8 or above
    (api/dataset {:id (repeatedly 30 random-id)
                  :v  (repeatedly 30 random-v)})))
ds ;; => _unnamed [30 2]:
;; | :id | :v |
;; |--------|----------|
;; | | :c |
;; | 3 | :b |
;; | 2 | :e |
;; | 3 | :b |
;; | 0 | :c |
;; | 0 | :b |
;; | | :b |
;; | 4 | :d |
;; | 0 | :d |
;; | 1 | :e |
;; | 2 | :a |
;; | 0 | :b |
;; | 4 | :c |
;; | 1 | :d |
;; | 3 | :b |
;; | 2 | :c |
;; | 1 | :a |
;; | 4 | :e |
;; | 1 | :b |
;; | | :a |
;; | 2 | :b |
;; | 1 | :d |
;; | 4 | :d |
;; | 4 | :b |
;; | 0 | :b |
;; build an index over the :id column; the two-argument `build-index` orders
;; keys with `compare`, so the nil (missing id) group sorts first
(def idx (build-index ds :id))
idx
;; => {nil #list<int32>[4]
;; [0, 6, 19, 29], 0 #list<int32>[5]
;; [4, 5, 8, 11, 24], 1 #list<int32>[5]
;; [9, 13, 16, 18, 21], 2 #list<int32>[5]
;; [2, 10, 15, 20, 26], 3 #list<int32>[4]
;; [1, 3, 14, 28], 4 #list<int32>[7]
;; [7, 12, 17, 22, 23, 25, 27]}
;; range slice: keys from 0 to 1, both ends inclusive
(slice-idx idx 0 1)
;; => {0 #list<int32>[5]
;; [4, 5, 8, 11, 24], 1 #list<int32>[5]
;; [9, 13, 16, 18, 21]}
;; key slice: only the listed keys are kept; the result is still in sorted key order
(slice-idx idx [2 4 0])
;; => {0 #list<int32>[5]
;; [4, 5, 8, 11, 24], 2 #list<int32>[5]
;; [2, 10, 15, 20, 26], 4 #list<int32>[7]
;; [7, 12, 17, 22, 23, 25, 27]}
;; row indexes for every key in the inclusive range 0..1
(select-by-idx idx 0 1)
;; => (4 5 8 11 24 9 13 16 18 21)
;; row indexes for the listed keys, concatenated in the order the keys are given
(select-by-idx idx [2 4 0])
;; => (2 10 15 20 26 7 12 17 22 23 25 27 4 5 8 11 24)
;; custom function to select rows using an index
(defn select-by
  "Select rows of `ds` through the index `idx`, either for every key in the
  range `from`..`to` or for the explicit collection of keys `ks`."
  ([ds idx from to]
   (->> (slice-idx idx from to)
        vals
        (mapcat identity)
        (api/select-rows ds)))
  ([ds idx ks]
   (->> (select-by-idx idx ks)
        (api/select-rows ds))))
;; rows whose :id falls in the range 2..3 (both ends inclusive)
(select-by ds idx 2 3)
;; => _unnamed [9 2]:
;; | :id | :v |
;; |--------|----------|
;; | 2 | :e |
;; | 2 | :a |
;; | 2 | :c |
;; | 2 | :b |
;; | 2 | :a |
;; | 3 | :b |
;; | 3 | :b |
;; | 3 | :b |
;; | 3 | :e |
;; rows whose :id is one of the listed keys, here 0 and 3
(select-by ds idx [0 3])
;; => _unnamed [9 2]:
;; | :id | :v |
;; |--------|----------|
;; | 0 | :c |
;; | 0 | :b |
;; | 0 | :d |
;; | 0 | :b |
;; | 0 | :b |
;; | 3 | :b |
;; | 3 | :b |
;; | 3 | :b |
;; | 3 | :e |
;; grouping works without any change: the index (a map from group key to row
;; indexes) is passed to `api/group-by` directly
(api/group-by ds idx)
;; => _unnamed [6 3]:
;; | :name | :group-id | :data |
;; |--------|-----------|-----------------|
;; | | 0 | Group: [4 2]: |
;; | 0 | 1 | Group: 0 [5 2]: |
;; | 1 | 2 | Group: 1 [5 2]: |
;; | 2 | 3 | Group: 2 [5 2]: |
;; | 3 | 4 | Group: 3 [4 2]: |
;; | 4 | 5 | Group: 4 [7 2]: |
;; group using only the index entries for keys 2 and 0; the slice is sorted,
;; so group 0 comes before group 2
(api/group-by ds (slice-idx idx [2 0]))
;; => _unnamed [2 3]:
;; | :name | :group-id | :data |
;; |--------|-----------|-----------------|
;; | 0 | 0 | Group: 0 [5 2]: |
;; | 2 | 1 | Group: 2 [5 2]: |
;;
;; index from both fields: each key is the [id v] vector produced by `(juxt :id :v)`
(def idx2 (build-index ds (juxt :id :v)))
;; range selection between two composite keys, both bounds inclusive
(select-by ds idx2 [nil :c] [2 :b])
;; => _unnamed [15 2]:
;; | :id | :v |
;; |--------|----------|
;; | | :c |
;; | | :e |
;; | 0 | :b |
;; | 0 | :b |
;; | 0 | :b |
;; | 0 | :c |
;; | 0 | :d |
;; | 1 | :a |
;; | 1 | :b |
;; | 1 | :d |
;; | 1 | :d |
;; | 1 | :e |
;; | 2 | :a |
;; | 2 | :a |
;; | 2 | :b |
;; explicit list of composite [id v] keys; rows come back in the order the keys are listed
(select-by ds idx2 [[1 :a] [2 :c] [nil :a]])
;; => _unnamed [3 2]:
;; | :id | :v |
;; |--------|----------|
;; | 1 | :a |
;; | 2 | :c |
;; | | :a |
;; group by the composite index: one group per distinct [id v] key
(api/group-by ds idx2)
;; => _unnamed [21 3]:
;; | :name | :group-id | :data |
;; |--------------------|-----------|------------------------|
;; | [nil :a] | 0 | Group: [nil :a] [1 2]: |
;; | [nil :b] | 1 | Group: [nil :b] [1 2]: |
;; | [nil :c] | 2 | Group: [nil :c] [1 2]: |
;; | [nil :e] | 3 | Group: [nil :e] [1 2]: |
;; | [0 :b] | 4 | Group: [0 :b] [3 2]: |
;; | [0 :c] | 5 | Group: [0 :c] [1 2]: |
;; | [0 :d] | 6 | Group: [0 :d] [1 2]: |
;; | [1 :a] | 7 | Group: [1 :a] [1 2]: |
;; | [1 :b] | 8 | Group: [1 :b] [1 2]: |
;; | [1 :d] | 9 | Group: [1 :d] [2 2]: |
;; | [1 :e] | 10 | Group: [1 :e] [1 2]: |
;; | [2 :a] | 11 | Group: [2 :a] [2 2]: |
;; | [2 :b] | 12 | Group: [2 :b] [1 2]: |
;; | [2 :c] | 13 | Group: [2 :c] [1 2]: |
;; | [2 :e] | 14 | Group: [2 :e] [1 2]: |
;; | [3 :b] | 15 | Group: [3 :b] [3 2]: |
;; | [3 :e] | 16 | Group: [3 :e] [1 2]: |
;; | [4 :b] | 17 | Group: [4 :b] [2 2]: |
;; | [4 :c] | 18 | Group: [4 :c] [1 2]: |
;; | [4 :d] | 19 | Group: [4 :d] [2 2]: |
;; | [4 :e] | 20 | Group: [4 :e] [2 2]: |
;; build index for even/odd split
(def idx3
  (build-index ds
               (fn [row]
                 ;; a missing :id yields nil, so those rows land under the nil key
                 (when-let [id (:id row)]
                   (even? id)))))
;; rows with an even :id (stored under the index key `true`)
(select-by ds idx3 [true])
;; => _unnamed [17 2]:
;; | :id | :v |
;; |--------|----------|
;; | 2 | :e |
;; | 0 | :c |
;; | 0 | :b |
;; | 4 | :d |
;; | 0 | :d |
;; | 2 | :a |
;; | 0 | :b |
;; | 4 | :c |
;; | 2 | :c |
;; | 4 | :e |
;; | 2 | :b |
;; | 4 | :d |
;; | 4 | :b |
;; | 0 | :b |
;; | 4 | :e |
;; | 2 | :a |
;; | 4 | :b |
;; rows with an odd :id (stored under the index key `false`)
(select-by ds idx3 [false])
;; => _unnamed [9 2]:
;; | :id | :v |
;; |--------|----------|
;; | 3 | :b |
;; | 3 | :b |
;; | 1 | :e |
;; | 1 | :d |
;; | 3 | :b |
;; | 1 | :a |
;; | 1 | :b |
;; | 1 | :d |
;; | 3 | :e |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment