Last active
March 21, 2021 18:39
-
-
Save genmeblog/18d6ed84224cbbef656adac4d85cc7ec to your computer and use it in GitHub Desktop.
Using TreeMap index to subselect datasets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(require '[clojure.set]
         '[tablecloth.api :as api])
(import '[java.util TreeMap])
(defprotocol IndexProto
  "Operations on a sorted index: a map from a key to a seqable collection of
  values (in this file: group key -> row numbers)."
  (slice-idx [idx ks] [idx from to] [idx from from-inclusive? to to-inclusive?]
    "Returns a new, independent sub-index; `idx` itself is never modified.
    [idx ks]      keeps only the entries whose key is contained in `ks`.
    [idx from to] keeps the key range from..to, both ends inclusive.
    [idx from from-inclusive? to to-inclusive?]
                  keeps the key range with explicitly chosen bounds.")
  (select-by-idx [idx ks] [idx from to]
    "Returns one flat sequence of the values stored in the index.
    [idx ks]      values of the listed keys, in the order of `ks`;
                  keys missing from the index contribute nothing.
    [idx from to] values of the inclusive key range from..to, in key order."))

;; TreeMap as an index
(extend-type TreeMap
  IndexProto
  (slice-idx
    ([idx ks]
     ;; Work on a clone so the source index stays untouched, then drop every
     ;; key that was not requested.
     (let [^TreeMap nidx (.clone ^TreeMap idx)]
       (.retainAll (.keySet nidx) ^java.util.Collection (set ks))
       nidx))
    ([idx from to]
     (slice-idx idx from true to true))
    ([idx from from-inclusive? to to-inclusive?]
     ;; `.subMap` alone returns a live view backed by `idx` that is not a
     ;; TreeMap. Copying it into a fresh TreeMap (same comparator) makes the
     ;; result independent and usable as an index again, like the `ks` arity.
     (TreeMap. ^java.util.SortedMap
               (.subMap ^TreeMap idx from from-inclusive? to to-inclusive?))))
  (select-by-idx
    ([idx from to]
     ;; The slice already holds the wanted values in key order.
     (mapcat identity (vals (slice-idx idx from to))))
    ([idx ks]
     ;; `.get` yields nil for an unknown key, which mapcat treats as empty.
     (mapcat #(.get ^TreeMap idx %) ks))))
;; Build an index from tablecloth's group-by result; the Clojure comparator (`compare` by default) allows `nil` and vector-based keys
(defn build-index
  "Builds a TreeMap index for dataset `ds`.
  `grouping` is handed to `api/group-by` with `{:result-type :as-indexes}`;
  every entry of that result is copied into a TreeMap ordered by `comparator`
  (defaults to `compare`)."
  ([ds grouping]
   (build-index ds grouping compare))
  ([ds grouping comparator]
   (let [groups (api/group-by ds grouping {:result-type :as-indexes})]
     (doto (TreeMap. comparator)
       (.putAll groups)))))
;; test dataset with missing values | |
(def ds
  (let [row-count 30
        ;; nil whenever the random draw is 0.8 or above, otherwise an int in 0..4
        random-id (fn [] (if (< (rand) 0.8) (rand-int 5) nil))
        random-v  (fn [] (rand-nth [:a :b :c :d :e]))]
    (api/dataset {:id (repeatedly row-count random-id)
                  :v  (repeatedly row-count random-v)})))
ds;; => _unnamed [30 2]: | |
;; | :id | :v | | |
;; |--------|----------| | |
;; | | :c | | |
;; | 3 | :b | | |
;; | 2 | :e | | |
;; | 3 | :b | | |
;; | 0 | :c | | |
;; | 0 | :b | | |
;; | | :b | | |
;; | 4 | :d | | |
;; | 0 | :d | | |
;; | 1 | :e | | |
;; | 2 | :a | | |
;; | 0 | :b | | |
;; | 4 | :c | | |
;; | 1 | :d | | |
;; | 3 | :b | | |
;; | 2 | :c | | |
;; | 1 | :a | | |
;; | 4 | :e | | |
;; | 1 | :b | | |
;; | | :a | | |
;; | 2 | :b | | |
;; | 1 | :d | | |
;; | 4 | :d | | |
;; | 4 | :b | | |
;; | 0 | :b | | |
;; build index on the :id column (default `compare` ordering)
(def idx (build-index ds :id))
idx
;; => {nil #list<int32>[4] | |
;; [0, 6, 19, 29], 0 #list<int32>[5] | |
;; [4, 5, 8, 11, 24], 1 #list<int32>[5] | |
;; [9, 13, 16, 18, 21], 2 #list<int32>[5] | |
;; [2, 10, 15, 20, 26], 3 #list<int32>[4] | |
;; [1, 3, 14, 28], 4 #list<int32>[7] | |
;; [7, 12, 17, 22, 23, 25, 27]} | |
;; slice by key range 0..1, both ends inclusive
(slice-idx idx 0 1)
;; => {0 #list<int32>[5]
;; [4, 5, 8, 11, 24], 1 #list<int32>[5]
;; [9, 13, 16, 18, 21]}
;; slice by explicit keys; the result stays sorted by key
(slice-idx idx [2 4 0])
;; => {0 #list<int32>[5]
;; [4, 5, 8, 11, 24], 2 #list<int32>[5]
;; [2, 10, 15, 20, 26], 4 #list<int32>[7]
;; [7, 12, 17, 22, 23, 25, 27]}
;; row numbers for the inclusive key range 0..1
(select-by-idx idx 0 1)
;; => (4 5 8 11 24 9 13 16 18 21)
;; row numbers for explicit keys, in the order the keys were given
(select-by-idx idx [2 4 0])
;; => (2 10 15 20 26 7 12 17 22 23 25 27 4 5 8 11 24)
;; custom function to select rows using an index | |
(defn select-by
  "Selects rows of `ds` through the index `idx`.
  [ds idx ks]      rows whose row numbers `select-by-idx` returns for keys `ks`.
  [ds idx from to] rows stored under the key range from..to, taken from
                   `(slice-idx idx from to)`."
  ([ds idx ks]
   (api/select-rows ds (select-by-idx idx ks)))
  ([ds idx from to]
   (let [row-groups (vals (slice-idx idx from to))]
     (api/select-rows ds (apply concat row-groups)))))
;; rows whose :id lies in the key range 2..3
(select-by ds idx 2 3)
;; => _unnamed [9 2]: | |
;; | :id | :v | | |
;; |--------|----------| | |
;; | 2 | :e | | |
;; | 2 | :a | | |
;; | 2 | :c | | |
;; | 2 | :b | | |
;; | 2 | :a | | |
;; | 3 | :b | | |
;; | 3 | :b | | |
;; | 3 | :b | | |
;; | 3 | :e | | |
;; rows whose :id is one of the listed keys
(select-by ds idx [0 3])
;; => _unnamed [9 2]: | |
;; | :id | :v | | |
;; |--------|----------| | |
;; | 0 | :c | | |
;; | 0 | :b | | |
;; | 0 | :d | | |
;; | 0 | :b | | |
;; | 0 | :b | | |
;; | 3 | :b | | |
;; | 3 | :b | | |
;; | 3 | :b | | |
;; | 3 | :e | | |
;; grouping works without any change: the index is passed to group-by as-is
(api/group-by ds idx)
;; => _unnamed [6 3]: | |
;; | :name | :group-id | :data | | |
;; |--------|-----------|-----------------| | |
;; | | 0 | Group: [4 2]: | | |
;; | 0 | 1 | Group: 0 [5 2]: | | |
;; | 1 | 2 | Group: 1 [5 2]: | | |
;; | 2 | 3 | Group: 2 [5 2]: | | |
;; | 3 | 4 | Group: 3 [4 2]: | | |
;; | 4 | 5 | Group: 4 [7 2]: | | |
;; group using only the index entries for keys 2 and 0
(api/group-by ds (slice-idx idx [2 0]))
;; => _unnamed [2 3]: | |
;; | :name | :group-id | :data | | |
;; |--------|-----------|-----------------| | |
;; | 0 | 0 | Group: 0 [5 2]: | | |
;; | 2 | 1 | Group: 2 [5 2]: | | |
;; | |
;; index from both fields: keys are [id v] vectors
(def idx2 (build-index ds (juxt :id :v)))
;; rows whose [id v] key lies in the range [nil :c]..[2 :b]
(select-by ds idx2 [nil :c] [2 :b])
;; => _unnamed [15 2]: | |
;; | :id | :v | | |
;; |--------|----------| | |
;; | | :c | | |
;; | | :e | | |
;; | 0 | :b | | |
;; | 0 | :b | | |
;; | 0 | :b | | |
;; | 0 | :c | | |
;; | 0 | :d | | |
;; | 1 | :a | | |
;; | 1 | :b | | |
;; | 1 | :d | | |
;; | 1 | :d | | |
;; | 1 | :e | | |
;; | 2 | :a | | |
;; | 2 | :a | | |
;; | 2 | :b | | |
;; rows for an explicit list of [id v] keys
(select-by ds idx2 [[1 :a] [2 :c] [nil :a]])
;; => _unnamed [3 2]: | |
;; | :id | :v | | |
;; |--------|----------| | |
;; | 1 | :a | | |
;; | 2 | :c | | |
;; | | :a | | |
;; group by the two-field index
(api/group-by ds idx2)
;; => _unnamed [21 3]: | |
;; | :name | :group-id | :data | | |
;; |--------------------|-----------|------------------------| | |
;; | [nil :a] | 0 | Group: [nil :a] [1 2]: | | |
;; | [nil :b] | 1 | Group: [nil :b] [1 2]: | | |
;; | [nil :c] | 2 | Group: [nil :c] [1 2]: | | |
;; | [nil :e] | 3 | Group: [nil :e] [1 2]: | | |
;; | [0 :b] | 4 | Group: [0 :b] [3 2]: | | |
;; | [0 :c] | 5 | Group: [0 :c] [1 2]: | | |
;; | [0 :d] | 6 | Group: [0 :d] [1 2]: | | |
;; | [1 :a] | 7 | Group: [1 :a] [1 2]: | | |
;; | [1 :b] | 8 | Group: [1 :b] [1 2]: | | |
;; | [1 :d] | 9 | Group: [1 :d] [2 2]: | | |
;; | [1 :e] | 10 | Group: [1 :e] [1 2]: | | |
;; | [2 :a] | 11 | Group: [2 :a] [2 2]: | | |
;; | [2 :b] | 12 | Group: [2 :b] [1 2]: | | |
;; | [2 :c] | 13 | Group: [2 :c] [1 2]: | | |
;; | [2 :e] | 14 | Group: [2 :e] [1 2]: | | |
;; | [3 :b] | 15 | Group: [3 :b] [3 2]: | | |
;; | [3 :e] | 16 | Group: [3 :e] [1 2]: | | |
;; | [4 :b] | 17 | Group: [4 :b] [2 2]: | | |
;; | [4 :c] | 18 | Group: [4 :c] [1 2]: | | |
;; | [4 :d] | 19 | Group: [4 :d] [2 2]: | | |
;; | [4 :e] | 20 | Group: [4 :e] [2 2]: | | |
;; build index for even/odd split; rows with a missing :id get the key nil
(def idx3 (build-index ds #(when-let [id (:id %)]
                             (even? id))))
;; even rows (index key `true`)
(select-by ds idx3 [true])
;; => _unnamed [17 2]: | |
;; | :id | :v | | |
;; |--------|----------| | |
;; | 2 | :e | | |
;; | 0 | :c | | |
;; | 0 | :b | | |
;; | 4 | :d | | |
;; | 0 | :d | | |
;; | 2 | :a | | |
;; | 0 | :b | | |
;; | 4 | :c | | |
;; | 2 | :c | | |
;; | 4 | :e | | |
;; | 2 | :b | | |
;; | 4 | :d | | |
;; | 4 | :b | | |
;; | 0 | :b | | |
;; | 4 | :e | | |
;; | 2 | :a | | |
;; | 4 | :b | | |
;; odd rows (index key `false`)
(select-by ds idx3 [false])
;; => _unnamed [9 2]: | |
;; | :id | :v | | |
;; |--------|----------| | |
;; | 3 | :b | | |
;; | 3 | :b | | |
;; | 1 | :e | | |
;; | 1 | :d | | |
;; | 3 | :b | | |
;; | 1 | :a | | |
;; | 1 | :b | | |
;; | 1 | :d | | |
;; | 3 | :e | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment