Skip to content

Instantly share code, notes, and snippets.

@a2ndrade
Last active December 17, 2015 16:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save a2ndrade/5641681 to your computer and use it in GitHub Desktop.
Save a2ndrade/5641681 to your computer and use it in GitHub Desktop.
Datomic: Finding duplicate attribute values
;; see http://stackoverflow.com/questions/16727590/what-is-a-good-way-of-finding-duplicates-in-datomic
;; Load the Datomic peer API under the alias `d`.
(require '[datomic.api :as d])
;; URI of the example database (uses the `mem` storage protocol).
(def uri "datomic:mem://duplicate-values")
;; Create the database at `uri` before connecting to it below.
(d/create-database uri)
;; Connection used by every transaction and query in the rest of this file.
(def conn (d/connect uri))
;; sample attribute definiton
;; Install the :business/name attribute (a cardinality-one string).
;; `d/transact` returns a future; dereferencing it blocks until the schema
;; transaction has completed and rethrows any transaction error here instead
;; of silently discarding it.
@(d/transact conn [{:db.install/_attribute :db.part/db
                    :db/id #db/id[:db.part/db]
                    :db/ident :business/name
                    :db/valueType :db.type/string
                    :db/cardinality :db.cardinality/one}])
;; sample data
;; Six entities: "a" appears twice, "b" three times, "c" once.
;; Dereferencing the future returned by `d/transact` waits for the
;; transaction to complete (and surfaces any error), so a database value
;; obtained afterwards with `(d/db conn)` is taken after the data is applied.
@(d/transact conn [{:db/id #db/id[:db.part/user] :business/name "a"}
                   {:db/id #db/id[:db.part/user] :business/name "b"}
                   {:db/id #db/id[:db.part/user] :business/name "c"}
                   {:db/id #db/id[:db.part/user] :business/name "a"}
                   {:db/id #db/id[:db.part/user] :business/name "b"}
                   {:db/id #db/id[:db.part/user] :business/name "b"}])
(defn find-duplicate-values
  "Returns a map from each value of attr-name that is shared by more than
  one entity in db to the number of entities holding that value. Values
  held by a single entity are omitted; the result is {} when there are no
  duplicates.

  db        - a Datomic database value
  attr-name - the attribute to inspect, e.g. :business/name"
  ;; The docstring must come before the parameter vector; placed after it,
  ;; the string is just a discarded body expression and never becomes :doc.
  [db attr-name]
  (->> (d/datoms db :aevt attr-name) ; all datoms of attr-name via the AEVT index
       (map :v)
       (frequencies)
       (filter #(> (second %) 1))    ; keep only [value count] pairs with count > 1
       (into {})))
(defn find-duplicate-values-datalog
  "Datalog variant of the duplicate-value count: returns a map from each
  value of attr-name that is shared by more than one entity in db to the
  number of entities holding that value. Values held by a single entity
  are omitted.

  db        - a Datomic database value
  attr-name - the attribute to inspect, e.g. :business/name"
  ;; The docstring must come before the parameter vector; placed after it,
  ;; the string is just a discarded body expression and never becomes :doc.
  [db attr-name]
  (->> (d/q '[:find (frequencies ?v)
              ;; :with ?e keeps one ?v per entity, so equal values on
              ;; different entities are not collapsed before aggregation.
              :with ?e
              :in $ ?a
              :where [?e ?a ?v]]
            db attr-name)
       ;; The aggregate comes back as the single element of a single tuple.
       (ffirst)
       (filter #(> (second %) 1))    ; keep only [value count] pairs with count > 1
       (into {})))
(defn find-duplicate-values-entids
  "Returns a sequence of the ids of all entities in db whose value for
  attr-name is also held by at least one other entity. Empty when there
  are no duplicates.

  db        - a Datomic database value
  attr-name - the attribute to inspect, e.g. :business/name"
  ;; The docstring must come before the parameter vector; placed after it,
  ;; the string is just a discarded body expression and never becomes :doc.
  [db attr-name]
  (->> (d/datoms db :aevt attr-name)      ; all datoms of attr-name via the AEVT index
       (group-by :v)                      ; value -> vector of datoms with that value
       (filter #(> (count (second %)) 1)) ; keep values with more than one datom
       (mapcat second)
       (map :e)))
(defn find-duplicate-values-entids-datalog
  "Datalog variant of the duplicate-entity lookup: returns a sequence of
  the ids of all entities in db whose value for attr-name is also held by
  at least one other entity. Empty when there are no duplicates.

  db        - a Datomic database value
  attr-name - the attribute to inspect, e.g. :business/name"
  ;; The docstring must come before the parameter vector; placed after it,
  ;; the string is just a discarded body expression and never becomes :doc.
  [db attr-name]
  (->> (d/q '[:find ?e ?v
              :in $ ?a
              :where [?e ?a ?v]]
            db attr-name)
       (group-by second)                  ; value -> vector of [entity value] tuples
       (filter #(> (count (second %)) 1)) ; keep values with more than one tuple
       (mapcat second)
       (map first)))                      ; entity id is the first tuple element
(defn find-distinct-values
  "Returns a lazy sequence of the distinct values of attr-name in db.

  db        - a Datomic database value
  attr-name - the attribute to inspect, e.g. :business/name"
  ;; The docstring must come before the parameter vector; placed after it,
  ;; the string is just a discarded body expression and never becomes :doc.
  [db attr-name]
  (->> (d/datoms db :aevt attr-name) ; all datoms of attr-name via the AEVT index
       (map :v)
       (distinct)))
(defn find-distinct-values-datalog
  "Datalog variant of the distinct-value lookup: returns a lazy sequence of
  the distinct values of attr-name in db.

  db        - a Datomic database value
  attr-name - the attribute to inspect, e.g. :business/name"
  ;; The docstring must come before the parameter vector; placed after it,
  ;; the string is just a discarded body expression and never becomes :doc.
  [db attr-name]
  ;; Only ?v is in :find, so each result tuple holds one value; `first`
  ;; unwraps it.
  (map first (d/q '[:find ?v
                    :in $ ?a
                    :where [_ ?a ?v]]
                  db attr-name)))
;; DEMO
;; Each pair of calls below runs the index-based function and then its
;; Datalog counterpart against the same database value.
(def attr-name :business/name)
;; Database value taken from the connection; all calls below read from it.
(def db (d/db conn))
;; Duplicate values with their occurrence counts.
(find-duplicate-values db attr-name)
(find-duplicate-values-datalog db attr-name)
;; e.g. {"a" 2, "b" 3}
;; Ids of the entities that hold a duplicated value.
(find-duplicate-values-entids db attr-name)
(find-duplicate-values-entids-datalog db attr-name)
;; e.g. (17592186045418 17592186045421 17592186045419 17592186045422 17592186045423)
;; Every distinct value of the attribute.
(find-distinct-values db attr-name)
(find-distinct-values-datalog db attr-name)
;; e.g. ("a" "b" "c")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment