Last active
December 17, 2015 16:49
-
-
Save a2ndrade/5641681 to your computer and use it in GitHub Desktop.
Datomic: Finding duplicate attribute values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; see http://stackoverflow.com/questions/16727590/what-is-a-good-way-of-finding-duplicates-in-datomic
(require '[datomic.api :as d])

;; In-memory database URI, so the example needs no external storage.
(def uri "datomic:mem://duplicate-values")
(d/create-database uri)
(def conn (d/connect uri))
;; sample attribute definition
;; d/transact returns a future; deref it so a failed transaction throws here
;; instead of being silently ignored.
@(d/transact conn [{:db.install/_attribute :db.part/db
                    :db/id #db/id[:db.part/db]
                    :db/ident :business/name
                    :db/valueType :db.type/string
                    :db/cardinality :db.cardinality/one}])
;; sample data: "a" appears twice, "b" three times, "c" once
;; d/transact returns a future; deref it so a failed transaction throws here
;; instead of being silently ignored.
@(d/transact conn [{:db/id #db/id[:db.part/user] :business/name "a"}
                   {:db/id #db/id[:db.part/user] :business/name "b"}
                   {:db/id #db/id[:db.part/user] :business/name "c"}
                   {:db/id #db/id[:db.part/user] :business/name "a"}
                   {:db/id #db/id[:db.part/user] :business/name "b"}
                   {:db/id #db/id[:db.part/user] :business/name "b"}])
(defn find-duplicate-values
  "Returns a map from each attr-name value that occurs more than once in db
  to the number of datoms carrying that value. Values that occur only once
  are omitted. Reads the :aevt index directly instead of running a query."
  [db attr-name]
  (->> (d/datoms db :aevt attr-name)
       (map :v)
       (frequencies)
       ;; keep only [value count] entries whose count exceeds 1
       (filter #(> (second %) 1))
       (into {})))
(defn find-duplicate-values-datalog
  "Returns a map from each attr-name value that occurs more than once in db
  to the number of entities carrying that value. Values that occur only once
  are omitted. Datalog counterpart of find-duplicate-values."
  [db attr-name]
  (->>
   ;; :with ?e keeps one ?v per entity, so equal values from different
   ;; entities are not collapsed before `frequencies` aggregates them.
   (d/q '[:find (frequencies ?v)
          :with ?e
          :in $ ?a
          :where [?e ?a ?v]]
        db attr-name)
   ;; the frequency map is the single column of the single result row
   (ffirst)
   (filter #(> (second %) 1))
   (into {})))
(defn find-duplicate-values-entids
  "Returns a sequence of the ids of entities whose attr-name value is shared
  with at least one other datom in db. Reads the :aevt index directly instead
  of running a query."
  [db attr-name]
  (->> (d/datoms db :aevt attr-name)
       (group-by :v)
       ;; keep only [value datoms] groups holding more than one datom
       (filter #(> (count (second %)) 1))
       (mapcat second)
       (map :e)))
(defn find-duplicate-values-entids-datalog
  "Returns a sequence of the ids of entities whose attr-name value is shared
  with at least one other entity in db. Datalog counterpart of
  find-duplicate-values-entids."
  [db attr-name]
  (->> (d/q '[:find ?e ?v
              :in $ ?a
              :where [?e ?a ?v]]
            db attr-name)
       ;; result rows are [entity-id value] tuples; group them by value
       (group-by second)
       (filter #(> (count (second %)) 1))
       (mapcat second)
       (map first)))
(defn find-distinct-values
  "Returns a lazy sequence of the distinct values of attr-name in db.
  Reads the :aevt index directly instead of running a query."
  [db attr-name]
  (->> (d/datoms db :aevt attr-name)
       (map :v)
       (distinct)))
(defn find-distinct-values-datalog
  "Returns a sequence of the distinct values of attr-name in db.
  Datalog counterpart of find-distinct-values."
  [db attr-name]
  ;; each result row is a one-element tuple [?v]; unwrap it
  (map first (d/q '[:find ?v
                    :in $ ?a
                    :where [_ ?a ?v]]
                  db attr-name)))
;; DEMO
(def attr-name :business/name)
(def db (d/db conn))

(find-duplicate-values db attr-name)
(find-duplicate-values-datalog db attr-name)
;; e.g. {"a" 2, "b" 3}

(find-duplicate-values-entids db attr-name)
(find-duplicate-values-entids-datalog db attr-name)
;; e.g. (17592186045418 17592186045421 17592186045419 17592186045422 17592186045423)

(find-distinct-values db attr-name)
(find-distinct-values-datalog db attr-name)
;; e.g. ("a" "b" "c")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment