|
(ns omegawiki.datomic |
|
" |
|
Omegawiki data import to datomic |
|
|
|
Needs about 800 MB memory when fully loaded. More during transacting! |
|
|
|
# Notes: |
|
|
|
We could unify :ow.expr/lang and :ow.option/lang (etc) to :ow/lang |
|
-> Adv: Simpler walking the graph, same meaning |
|
-> Disadv: A littler harder querying (no more e-by-av but real queries needed)." |
|
(:require |
|
[clojure.java.jdbc :as jdbc] |
|
[datomic.api :as da] |
|
[clojure.string :as str]) |
|
(:import |
|
(datomic Entity))) |
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
;; MYSQL HELPERS |
|
(defn stringify-bytes |
|
[s] |
|
(String. ^bytes s)) |
|
|
|
(defn update* |
|
[m k & args] |
|
(if-let [[k v] (find m k)] |
|
(apply update m k args) |
|
m)) |
|
|
|
(defn row-fn |
|
[row] |
|
(-> row |
|
(update* :table keyword) |
|
(update* :spelling stringify-bytes) |
|
(update* :type_spelling stringify-bytes) |
|
(update* :value_spelling stringify-bytes) |
|
(update* :text stringify-bytes) |
|
(update* :text_text stringify-bytes) |
|
(update* :text_flags stringify-bytes))) |
|
|
|
(defn q |
|
"Queries the Omegawiki mysql db." |
|
[& args] |
|
(jdbc/query {:subprotocol "mysql" |
|
:subname "//127.0.0.1:3306/ow" |
|
:user "ow" |
|
:password "foobarlol"} |
|
(keep identity args) |
|
{:row-fn row-fn})) |
|
|
|
(defn ffilter |
|
"First filter." |
|
[f xs] |
|
(first (filter f xs))) |
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
;; DATOMIC SETUP: |
|
(def ^:const uri "datomic:mem://ow") |
|
|
|
(def conn |
|
(do |
|
(da/create-database uri) |
|
(da/connect uri))) |
|
#_(da/delete-database uri) |
|
|
|
(defn db [] (da/db conn)) |
|
|
|
(defn num-datoms |
|
"Just for fun. ~9.2M Datoms" |
|
[] |
|
(count (seq (da/datoms (db) :eavt)))) |
|
|
|
(def object-table->ref-id |
|
"Maps the objects in the ojbect table to the ids in datomic these map to." |
|
{:uw_translated_content_attribute_values :ow.trans-content.attr/id, |
|
:uw_class_attributes :ow.attr/id |
|
:uw_class_membership :ow.class/id |
|
:uw_text_attribute_values :ow.text.attr/id, |
|
:uw_syntrans :ow.syntrans/id, |
|
:uw_expression :ow.expr/id, |
|
:uw_expression_ns nil, ;; Table doesn't exist anymore... |
|
:uw_translated_content :ow.tc/id, |
|
:uw_defined_meaning :ow.meaning/id, |
|
:uw_meaning_relations :ow.relation/id, |
|
:uw_collection :ow.coll/id, |
|
:uw_option_attribute_options :ow.option/id, |
|
:uw_url_attribute_values :ow.url/id, |
|
:uw_option_attribute_values :ow.attr.val/id, |
|
:translated_content nil ;; Table doesn't exist anymore? |
|
:uw_collection_ns nil}) |
|
|
|
(defn t! |
|
[tx] |
|
@(da/transact conn tx)) |
|
|
|
(defn t |
|
"Shorthand for touch" |
|
[x] |
|
(when (and x (instance? Entity x)) |
|
(da/touch x))) |
|
|
|
(defn tc |
|
"Shorthand for touch of all children" |
|
[ent] |
|
(when ent |
|
(run! (comp t #(% ent)) (keys ent)) |
|
ent)) |
|
|
|
(defn es-by-a |
|
"Returns all entities with the given attribute." |
|
[a] |
|
(let [db (db)] |
|
(map (fn [[e]] |
|
(t (da/entity db e))) |
|
(da/datoms db :aevt a)))) |
|
#_(es-by-a :db/doc) |
|
|
|
(defn es-by-av |
|
"Returns all entities with the given attribute + value." |
|
[a v] |
|
(let [db (db)] |
|
(map (fn [[e]] |
|
(t (da/entity db e))) |
|
(da/datoms db :avet a v)))) |
|
#_(es-by-av :foo 8) |
|
|
|
(defn fulltext |
|
[attr txt] |
|
(let [db (db)] |
|
(map (fn [[e score]] |
|
(t (da/entity db e))) |
|
(sort-by (comp - second) |
|
(da/q '[:find ?e ?score |
|
:in $ ?attr ?txt |
|
:where [(fulltext $ ?attr ?txt) [[?e _ _ ?score]]]] |
|
db attr txt))))) |
|
#_(take 7 (fulltext :ow.expr/spelling "accident")) |
|
|
|
(defn e-by-av |
|
"An single entity for the given attr + value" |
|
[a v] |
|
(t (first (es-by-av a v)))) |
|
|
|
(defn e-by-a |
|
"Eagerly returns all entities with the given attribute + value." |
|
[a] |
|
(t (first (es-by-a a)))) |
|
|
|
(defn entity |
|
"Datomic entity with touch." |
|
[eid] |
|
(t (da/entity (db) eid))) |
|
|
|
(defn ident->str |
|
[kw] |
|
(subs (str kw) 1)) |
|
|
|
;; Problem: |
|
;; Omegawiki sometimes differentiates between US/UK english. |
|
;; So lang/id 85 (normal "eng") might not be used for english expression and instead |
|
;; "eng-UK" or "eng-US" is used (as ISO3) |
|
(def iso3->iso1 |
|
"Note: This map is NOT invertable. We're mapping to the same ISO1 sometimes |
|
(eg eng-US and eng-UK are both \"en\" in iso1)" |
|
{"abk" "ab" "aar" "aa" "afr" "af" "aka" "ak" "sqi" "sq" "amh" "am" "ara" "ar" "arg" "an" |
|
"hye" "hy" "asm" "as" "ava" "av" "ave" "ae" "aym" "ay" "aze" "az" "bam" "bm" "bak" "ba" |
|
"eus" "eu" "bel" "be" "ben" "bn" "bih" "bh" "bis" "bi" "bos" "bs" "bre" "br" "bul" "bg" |
|
"mya" "my" "cat" "ca" "cha" "ch" "che" "ce" "nya" "ny" "zho" "zh" "chv" "cv" "cor" "kw" |
|
"cos" "co" "cre" "cr" "hrv" "hr" "ces" "cs" "dan" "da" "div" "dv" "nld" "nl" "dzo" "dz" |
|
"eng" "en" "epo" "eo" "est" "et" "ewe" "ee" "fao" "fo" "fij" "fj" "fin" "fi" "fra" "fr" |
|
"ful" "ff" "glg" "gl" "kat" "ka" "deu" "de" "ell" "el" "grn" "gn" "guj" "gu" "hat" "ht" |
|
"hau" "ha" "heb" "he" "her" "hz" "hin" "hi" "hmo" "ho" "hun" "hu" "ina" "ia" "ind" "id" |
|
"ile" "ie" "gle" "ga" "ibo" "ig" "ipk" "ik" "ido" "io" "isl" "is" "ita" "it" "iku" "iu" |
|
"jpn" "ja" "jav" "jv" "kal" "kl" "kan" "kn" "kau" "kr" "kas" "ks" "kaz" "kk" "khm" "km" |
|
"kik" "ki" "kin" "rw" "kir" "ky" "kom" "kv" "kon" "kg" "kor" "ko" "kur" "ku" "kua" "kj" |
|
"lat" "la" "ltz" "lb" "lug" "lg" "lim" "li" "lin" "ln" "lao" "lo" "lit" "lt" "lub" "lu" |
|
"lav" "lv" "glv" "gv" "mkd" "mk" "mlg" "mg" "msa" "ms" "mal" "ml" "mlt" "mt" "mri" "mi" |
|
"mar" "mr" "mah" "mh" "mon" "mn" "nau" "na" "nav" "nv" "nde" "nd" "nep" "ne" "ndo" "ng" |
|
"nob" "nb" "nno" "nn" "nor" "no" "iii" "ii" "nbl" "nr" "oci" "oc" "oji" "oj" "chu" "cu" |
|
"orm" "om" "ori" "or" "oss" "os" "pan" "pa" "pli" "pi" "fas" "fa" "pol" "pl" "pus" "ps" |
|
"por" "pt" "que" "qu" "roh" "rm" "run" "rn" "ron" "ro" "rus" "ru" "san" "sa" "srd" "sc" |
|
"snd" "sd" "sme" "se" "smo" "sm" "sag" "sg" "srp" "sr" "gla" "gd" "sna" "sn" "sin" "si" |
|
"slk" "sk" "slv" "sl" "som" "so" "sot" "st" "spa" "es" "sun" "su" "swa" "sw" "ssw" "ss" |
|
"swe" "sv" "tam" "ta" "tel" "te" "tgk" "tg" "tha" "th" "tir" "ti" "bod" "bo" "tuk" "tk" |
|
"tgl" "tl" "tsn" "tn" "ton" "to" "tur" "tr" "tso" "ts" "tat" "tt" "twi" "tw" "tah" "ty" |
|
"uig" "ug" "ukr" "uk" "urd" "ur" "uzb" "uz" "ven" "ve" "vie" "vi" "vol" "vo" "wln" "wa" |
|
"cym" "cy" "wol" "wo" "fry" "fy" "xho" "xh" "yid" "yi" "yor" "yo" "zha" "za" "zul" "zu" |
|
;; Manually add some mappings: (TODO: Chinese mappings) |
|
"eng-UK" "en" |
|
"eng-US" "en"}) |
|
|
|
(defn t-datomic-schema! |
|
"Transacts the datomic schema!" |
|
[] |
|
(let [attr (fn attr |
|
[ident col doc] |
|
{:db/id (ident->str ident) ;; We use string temp ids to ref in our tx |
|
:db/ident ident |
|
:db/cardinality :db.cardinality/one |
|
:ow/columns {:db/id col |
|
:ow/column col} |
|
:db/doc doc}) |
|
many! #(assoc % :db/cardinality :db.cardinality/many) |
|
fulltext! #(assoc % :db/fulltext true) |
|
index! #(assoc % :db/index true) |
|
component! #(assoc % :db/isComponent true) |
|
id! #(assoc % :db/unique :db.unique/identity) |
|
str! #(assoc % :db/valueType :db.type/string) |
|
kw! #(assoc % :db/valueType :db.type/keyword) |
|
bool! #(assoc % :db/valueType :db.type/boolean) |
|
ref! (fn [m ident] |
|
(cond-> (assoc m :db/valueType :db.type/ref) |
|
ident (assoc :ow/refs (ident->str ident)))) ;; use string tempid |
|
long! #(assoc % :db/valueType :db.type/long)] |
|
;; Bootstrap the schema with a :ow/column so we can query the schema right away |
|
(t! [{:db/ident :ow/column |
|
:db/cardinality :db.cardinality/one |
|
:db/unique :db.unique/identity |
|
:db/valueType :db.type/string} |
|
{:db/ident :ow/columns |
|
:db/cardinality :db.cardinality/many |
|
;:db/unique :db.unique/identity |
|
:db/valueType :db.type/ref} |
|
;; We also document which :db/ident a ref references. |
|
;; We need this when transacting to wrap the transaction ref with that :db/ident: |
|
{:db/ident :ow/refs |
|
:db/cardinality :db.cardinality/one |
|
:db/valueType :db.type/ref}]) |
|
(->> |
|
[;; TABLE: language |
|
#_(q "describe language") |
|
#_(q "select * from language limit 10") |
|
#_(q "select COUNT(*) from language") |
|
(-> (attr :ow.lang/id "language.language_id" |
|
"The language id. PK of language table.") |
|
long! id!) |
|
(-> (attr :ow.lang/dialect-of "language.dialect_of_lid" |
|
"A ref to a language.") |
|
(ref! :ow.lang/id)) |
|
(-> (attr :ow.lang/iso2 "language.iso639_2" |
|
"The ISO 639-2 string. Sometimes not present! We can have the same ISO-2 Codes!!") |
|
str! index!) |
|
(-> (attr :ow.lang/iso1 "non-existing.iso1" |
|
"The ISO 639-1 string. Sometimes not present! We can have the same ISO-1 Codes!!") |
|
str! index!) |
|
(-> (attr :ow.lang/iso3 "language.iso639_3" |
|
"The ISO 639-3 string. Distinct.") |
|
str! id!) |
|
(-> (attr :ow.lang/wikimedia-key "language.wikimedia_key" |
|
"Sometimes NOT present!") |
|
str! id!) |
|
|
|
;; language_names TABLE: |
|
;; Not needed. We're fine with only english language names... |
|
|
|
;; uw_alt_meaningtexts TABLE: |
|
;; Not needed? Only ~450 entries. |
|
|
|
;; uw_bootstrapped_defined_meanings TABLE: |
|
;; Not needed. We can use idents for these 5 entries if needed. |
|
|
|
;; TABLE: uw_class_attributes |
|
(-> (attr :ow.attr/id "uw_class_attributes.object_id" |
|
"The object id. PK of table.") |
|
long! id!) |
|
(-> (attr :ow.attr/meaning "uw_class_attributes.class_mid" |
|
"A ref to a meaning.") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.attr/level "uw_class_attributes.level_mid" |
|
"A ref to a meaning.") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.attr/attr "uw_class_attributes.attribute_mid" |
|
"A ref to a meaning.") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.attr/type "uw_class_attributes.attribute_type" |
|
"Either :text, :optn, :dm, :url or :trns") |
|
kw! index!) |
|
|
|
;; TABLE: uw_class_membership |
|
(-> (attr :ow.class/id "uw_class_membership.class_membership_id" |
|
"The PK? (but not really?)") |
|
long! id!) |
|
(-> (attr :ow.class/meaning "uw_class_membership.class_mid" |
|
"Identifies the class (eg. \"animal\")") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.class/member "uw_class_membership.class_member_mid" |
|
"Identifies the member (eg . \"dog\")") |
|
(ref! :ow.meaning/id)) |
|
|
|
;; TABLE: uw_collection |
|
(-> (attr :ow.coll/id "uw_collection.collection_id" |
|
"The PK? (but not really?)") |
|
long! id!) |
|
(-> (attr :ow.coll/meaning "uw_collection.collection_mid" |
|
"A ref to the meaning") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.coll/type "uw_collection.collection_type" |
|
"A keyword, either :CLAS or :LANG") |
|
kw! index!) |
|
|
|
;; TABLE: uw_collection_contents |
|
(-> (attr :ow.coll.object/id "uw_collection_contents.object_id" |
|
"ALWAYS NIL! HMMM") |
|
long! id!) |
|
(-> (attr :ow.coll.object/coll "uw_collection_contents.collection_id" |
|
"A ref to the collection") |
|
(ref! :ow.coll/id)) |
|
(-> (attr :ow.coll.object/meaning "uw_collection_contents.member_mid" |
|
"A ref to the meaning") |
|
(ref! :ow.meaning/id)) |
|
|
|
;; TABLE: uw_collection_language |
|
;; => EMPTY ignore |
|
|
|
;; TABLE: uw_defined_meaning |
|
#_(q "select * from uw_defined_meaning LIMIT 20") |
|
(-> (attr :ow.meaning/id "uw_defined_meaning.defined_meaning_id" |
|
"The meaning id. PK of table.") |
|
long! id!) |
|
(-> (attr :ow.meaning/expr "uw_defined_meaning.expression_id" |
|
"A ref to the expression") |
|
(ref! :ow.expr/id)) |
|
(-> (attr :ow.meaning/text "uw_defined_meaning.meaning_text_tcid" |
|
"A ref to the translatable content") |
|
(ref! :ow.tc/id)) |
|
|
|
;; TABLE: uw_expression |
|
#_(q "select * from uw_expression limit 10") |
|
(-> (attr :ow.expr/id "uw_expression.expression_id" |
|
"The expression id. PK of table.") |
|
long! id!) |
|
(-> (attr :ow.expr/spelling "uw_expression.spelling" |
|
"The actual text") |
|
str! fulltext! index!) |
|
(-> (attr :ow.expr/lang "uw_expression.language_id" |
|
"A ref to a language") |
|
(ref! :ow.lang/id)) |
|
|
|
;; TABLE: uw_meaning_relations |
|
#_(q "select * from uw_meaning_relations limit 10") |
|
#_(q "select DISTINCT(relationtype_mid) from uw_meaning_relations limit 10") |
|
(-> (attr :ow.relation/id "uw_meaning_relations.relation_id" |
|
"PK of table.") |
|
long! id!) |
|
(-> (attr :ow.relation/meaning1 "uw_meaning_relations.meaning1_mid" |
|
"A ref to a meaning") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.relation/meaning2 "uw_meaning_relations.meaning2_mid" |
|
"A ref to a meaning") |
|
(ref! :ow.meaning/id)) |
|
;; TODO: Or to a attribute? I don't use this though... |
|
(-> (attr :ow.relation/type "uw_meaning_relations.relationtype_mid" |
|
"A ref to defined meaning") |
|
(ref! :ow.meaning/id)) |
|
|
|
;; TABLE: uw_objects |
|
;; TODO: This table is just aweful. It ref's rows in various tables |
|
;; We should hand transact this and create the refs to the table-objects |
|
;; We manually transform this table. Datomic can ref anything and is flexible... |
|
#_(q "select * from uw_objects limit 10") |
|
(-> (attr :ow.object/id "uw_objects.object_id" |
|
"PK of table.") |
|
long! id!) |
|
;; Still add table so we know what kind of ref this is. |
|
;; Better would be to do different refs like: |
|
;; - :ow/syntrans |
|
;; - :ow/expr .... etc. |
|
#_(-> (attr :ow.object/table "uw_objects.table" |
|
"The table keyword") |
|
index! kw!) |
|
|
|
#_(-> (attr :ow.object/ref "doesnt.exist.xyz" |
|
"The table keyword") |
|
;; What it refs depends on the table and is looked up with the map: |
|
;; object-table->ref-id (see above) |
|
(ref! nil)) |
|
|
|
;; TABLE: uw_option_attribute_options |
|
#_(q "select * from uw_option_attribute_options limit 10") |
|
#_(q "select COUNT(*) from uw_option_attribute_options limit 10") |
|
(-> (attr :ow.option/id "uw_option_attribute_options.option_id" |
|
"PK of table.") |
|
long! id!) |
|
(-> (attr :ow.option/attribute "uw_option_attribute_options.attribute_id" |
|
"A ref to an object") |
|
(ref! :ow.object/id)) |
|
(-> (attr :ow.option/meaning "uw_option_attribute_options.option_mid" |
|
"A ref to an defined meaning") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.option/lang "uw_option_attribute_options.language_id" |
|
"A ref to a language") |
|
(ref! :ow.lang/id)) |
|
|
|
|
|
;; TABLE: uw_option_attribute_values |
|
#_(q "describe uw_option_attribute_values") |
|
#_(q "select * from uw_option_attribute_values limit 10") |
|
#_(q "select COUNT(*) from uw_option_attribute_values") |
|
(-> (attr :ow.attr.val/id "uw_option_attribute_values.value_id" |
|
"PK of table.") |
|
long! id!) |
|
;; NOTE: This also refers to syntrans id's (when used for Part of speech, genus). |
|
(-> (attr :ow.attr.val/object "uw_option_attribute_values.object_id" |
|
"Ref to an object") |
|
(ref! :ow.object/id)) |
|
(-> (attr :ow.attr.val/option "uw_option_attribute_values.option_id" |
|
"Ref to an option") |
|
(ref! :ow.option/id)) |
|
|
|
;; TABLE: uw_syntrans |
|
#_(q "describe uw_syntrans") |
|
#_(q "select * from uw_syntrans limit 10") |
|
#_(q "select COUNT(*) from uw_syntrans") ;; 680k |
|
(-> (attr :ow.syntrans/id "uw_syntrans.syntrans_sid" |
|
"PK of table.") |
|
long! id!) |
|
(-> (attr :ow.syntrans/meaning "uw_syntrans.defined_meaning_id" |
|
"Ref to a meaning") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.syntrans/expr "uw_syntrans.expression_id" |
|
"Ref to an expression") |
|
(ref! :ow.expr/id)) |
|
(-> (attr :ow.syntrans/identical? "uw_syntrans.identical_meaning" |
|
"To avoid semantic drift. Specify if the translation is exact or approx.") |
|
bool!) |
|
|
|
;; TABLE: uw_text |
|
#_(q "describe uw_text") |
|
#_(q "select * from uw_text limit 10") |
|
#_(q "select DISTINCT(text_flags) from uw_text") ;; always "" |
|
#_(q "select COUNT(*) from uw_text") ;; 226k |
|
(-> (attr :ow.text/id "uw_text.text_id" |
|
"The PK.") |
|
long! id!) |
|
(-> (attr :ow.text/text "uw_text.text_text" |
|
"The main text. The long description for the Defined Meaning. Also examples?") |
|
str!) |
|
(-> (attr :ow.text/lang "doens't exist" |
|
;; SPECIAL! Manual |
|
"A ref to a language") |
|
(ref! :ow.lang/id)) |
|
;; text_flags is always "" so we omit it. |
|
|
|
;; TABLE: uw_text_attribute_values |
|
#_(q "describe uw_text_attribute_values") |
|
#_(q "select * from uw_text_attribute_values limit 10") |
|
#_(q "select COUNT(*) from uw_text_attribute_values") ;; 42k |
|
(-> (attr :ow.text.attr/id "uw_text_attribute_values.value_id" |
|
"The PK.") |
|
long! id!) |
|
(-> (attr :ow.text.attr/object "uw_text_attribute_values.object_id" |
|
"ref to an object from objects table") |
|
(ref! :ow.object/id)) |
|
(-> (attr :ow.text.attr/attr "uw_text_attribute_values.attribute_mid" |
|
"ref to a meaning") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.text.attr/text "uw_text_attribute_values.text" |
|
"The actual text string") |
|
str!) |
|
|
|
;; TABLE: uw_translated_content |
|
#_(q "describe uw_translated_content") |
|
#_(q "select * from uw_translated_content limit 10") |
|
#_(q "select COUNT(*) from uw_translated_content") ;; 226k |
|
;; NOTE: This table design is bad. A single :ow.text/id |
|
;; has only ever ONE language. So we should add the language ref to the |
|
;; :ow.text/id instead of in this table. We'll do this manually: |
|
;; NOT IDEMPOTENT since we have a many ref here! |
|
(-> (attr :ow.tc/id "uw_translated_content.translated_content_id" |
|
"The translated content id. PK of table.") |
|
long! id!) |
|
;; Ignore the language_id and don't add it here but instead ot the |
|
;; :ow.text/id entity itself! |
|
(-> (attr :ow.tc/text "uw_translated_content.text_id" |
|
"Many ref to a the texts") |
|
(ref! :ow.text/id) many!) |
|
|
|
;; TABLE: uw_translated_content_attribute_values |
|
#_(q "describe uw_translated_content_attribute_values") |
|
#_(q "select * from uw_translated_content_attribute_values limit 10") |
|
#_(q "select COUNT(*) from uw_translated_content_attribute_values") |
|
(-> (attr :ow.trans-content.attr/id "uw_translated_content_attribute_values.value_id" |
|
"PK of table.") |
|
long! id!) |
|
(-> (attr :ow.trans-content.attr/object "uw_translated_content_attribute_values.object_id" |
|
"Ref to an object") |
|
(ref! :ow.object/id)) |
|
(-> (attr :ow.trans-content.attr/attr "uw_translated_content_attribute_values.attribute_mid" |
|
"Ref to a meaning") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.trans-content.attr/tc "uw_translated_content_attribute_values.value_tcid" |
|
"Ref to a translated content") |
|
(ref! :ow.tc/id)) |
|
|
|
;; TABLE: uw_url_attribute_values |
|
#_(q "describe uw_url_attribute_values") |
|
#_(q "select * from uw_url_attribute_values limit 10") |
|
#_(q "select DISTINCT(label) from uw_url_attribute_values limit 10") |
|
#_(q "select COUNT(*) from uw_url_attribute_values") |
|
(-> (attr :ow.url/id "uw_url_attribute_values.value_id" |
|
"PK of table.") |
|
long! id!) |
|
(-> (attr :ow.url/object "uw_url_attribute_values.object_id" |
|
"Ref to an object") |
|
(ref! :ow.object/id)) |
|
(-> (attr :ow.url/attr "uw_url_attribute_values.attribute_mid" |
|
"Ref to a meaning") |
|
(ref! :ow.meaning/id)) |
|
(-> (attr :ow.url/url "uw_url_attribute_values.url" |
|
"The actual URL") |
|
str!) |
|
(-> (attr :ow.url/label "uw_url_attribute_values.label" |
|
"The link text for the URL") |
|
str!) |
|
;; TABLE: wikidata_sets |
|
;; HMM doesn't exist??? |
|
#_(q "describe wikidata_sets")] |
|
t!))) |
|
|
|
(defn key-translator |
|
"Given a single row of a query, will return a function that translates the keys |
|
to the datomic keys." |
|
[ex-map table strip-empty? transform] |
|
(let [kv-mapper (reduce-kv |
|
(fn [m k _] |
|
(let [column (e-by-av :ow/column (str table "." (name k))) |
|
schema-entries (:ow/_columns column) |
|
_ (assert (or (nil? schema-entries) (= 1 (count schema-entries))) |
|
(pr-str "HMM:" schema-entries (str table "." (name k)))) |
|
{:db/keys [ident], :ow/keys [refs]} (first schema-entries)] |
|
(assoc m k |
|
[ident (if refs |
|
(fn [v] {refs v}) |
|
(fn [v] v))]))) |
|
{} |
|
ex-map)] |
|
(fn [m] |
|
(into {} |
|
(map (fn [[k v]] |
|
(let [[mapped-key v-map] (get kv-mapper k) |
|
v ((get transform k identity) v)] |
|
(if (and strip-empty? |
|
(or (nil? v) (= "" v))) |
|
nil |
|
[mapped-key (v-map v)])))) |
|
m)))) |
|
|
|
(defn transact-table |
|
"Transacts the table into datomic. If strip-emtpy? is true all NULL and empty strings |
|
are not added to the entity (Done so we can still mark these attributes as :db.unique/..." |
|
[table more strip-empty? transform] |
|
(let [before (System/currentTimeMillis) |
|
data (q (str "Select * from " table " " more)) |
|
tr (key-translator (first data) table strip-empty? transform) |
|
tx-res (t! (mapv tr data))] |
|
(println (str "Entities: " (count data))) |
|
(println (str "Took: " (/ (- (System/currentTimeMillis) before) 1000.0) "s")) |
|
;; Return an example entity: |
|
(first (map tr data)) |
|
#_tx-res)) |
|
|
|
(defn import-all |
|
"TAKES SOME TIME! (2-3 min) |
|
Imports all the tables into datomic" |
|
[] |
|
;; Code is like this to use the REPL |
|
|
|
;; We need to first transact the object table since we need both of the identity |
|
;; attributes to belong to the same :db/id in Datomic: |
|
;; Manual transacting since the object table ref's various entities depending on the |
|
;; table. We take a simple approach here and simple add both, ow.xyz/id as well as |
|
;; ow.object/id to the same entity. That way we safe one intermediate hop/entity which |
|
;; adds nothing. |
|
(t! |
|
(into [] |
|
(keep |
|
(fn [{:keys [object_id table]}] |
|
(when-some [ref-ident (get object-table->ref-id table)] |
|
;; We also associate the object id to the same entity. |
|
;; That way we save "one hope" |
|
{:ow.object/id object_id |
|
ref-ident object_id}))) |
|
(q "select * from uw_objects"))) |
|
|
|
;; 500 -> 0s |
|
(transact-table "language" nil |
|
true |
|
;; Not interested in the dialect_of data: |
|
{:dialect_of_lid (constantly nil)}) |
|
(es-by-av :ow.lang/id 85) ;; Works English. |
|
|
|
;; Manually add ISO1 version which I need: |
|
(t! (reduce-kv |
|
(fn [xs iso3 iso1] |
|
(conj xs |
|
{:ow.lang/iso3 iso3 |
|
:ow.lang/iso1 iso1})) |
|
[] |
|
iso3->iso1)) |
|
|
|
;; 300 -> 0s |
|
(transact-table "uw_class_attributes" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:attribute_type keyword |
|
:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(es-by-av :ow.attr/type :TEXT) |
|
|
|
;; 11k -> 1s |
|
(transact-table "uw_class_membership" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(e-by-a :ow.class/id) |
|
|
|
;; 41 -> 0s |
|
(transact-table "uw_collection" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:collection_type keyword |
|
:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(e-by-a :ow.coll/id) |
|
|
|
;; TODO: This is n-m relation without an id. We should ref-many the collection the meaning |
|
;; 25k -> 2s |
|
(transact-table "uw_collection_contents" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:internal_member_id (constantly nil) ;; Not interested... |
|
:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
|
|
;; 55k -> 3s |
|
(transact-table "uw_defined_meaning" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(e-by-a :ow.meaning/id) |
|
|
|
;; 524k -> 26s |
|
(transact-table "uw_expression" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(take 10 (es-by-a :ow.expr/id)) |
|
(take 5 (fulltext :ow.expr/spelling "house")) |
|
|
|
;; 41k -> 3s |
|
(transact-table "uw_meaning_relations" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(take 5 (es-by-a :ow.relation/id)) |
|
|
|
;; 700 -> 0s |
|
(transact-table "uw_option_attribute_options" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(take 5 (es-by-a :ow.option/id)) |
|
|
|
|
|
;; 123k vals -> 5s |
|
(transact-table "uw_option_attribute_values" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(take 5 (es-by-a :ow.attr.val/id)) |
|
|
|
;; 600k values -> 40s |
|
(transact-table "uw_syntrans" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(take 5 (es-by-a :ow.syntrans/id)) |
|
|
|
;; 226k -> 5s |
|
;; We add some language entires to these entities later on! |
|
(transact-table "uw_text" |
|
"" |
|
true {:text_flags (constantly nil)}) |
|
(take 5 (es-by-a :ow.text/id)) |
|
|
|
;; 42k -> 3s |
|
(transact-table "uw_text_attribute_values" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(take 5 (es-by-a :ow.text.attr/id)) |
|
|
|
;; 173k -> 8s |
|
;; NOT IDEMPOTENT!! We have ref-many attrs in this one!!!!! |
|
(transact-table "uw_translated_content" |
|
"WHERE remove_transaction_id IS NULL" |
|
;; Special treatment for the language_id here. We don't add it to this entity |
|
;; but instead add it to the :ow.text/id itself. Much better. |
|
true {:language_id (constantly nil) |
|
:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(take 5 (es-by-a :ow.tc/id)) |
|
|
|
;; NOTE: Manually query the language_id and add a lang ref to the :ow.text/id entity. |
|
;; 5sec |
|
(t! |
|
(mapv |
|
(fn [{:keys [language_id text_id]}] |
|
{:ow.text/id text_id |
|
:ow.text/lang {:ow.lang/id language_id}}) |
|
(q (str "select language_id, text_id from uw_translated_content" |
|
" where language_id is not null")))) |
|
|
|
;; 600 -> 0s |
|
(transact-table "uw_translated_content_attribute_values" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(take 5 (es-by-a :ow.trans-content.attr/id)) |
|
|
|
;; 9k -> 1s |
|
(transact-table "uw_url_attribute_values" |
|
"WHERE remove_transaction_id IS NULL" |
|
true {:remove_transaction_id (constantly nil) |
|
:add_transaction_id (constantly nil)}) |
|
(take 5 (es-by-a :ow.url/id))) |
|
#_(time (import-all)) |
|
|
|
(def possible-backrefs |
|
"All (delay) collection of all backrefs (:foo.bar/_xyz) keywords of the datomic db" |
|
(delay |
|
(keep |
|
(fn [{:db/keys [ident valueType]}] |
|
(when (and (.startsWith (str ident) ":ow.") |
|
(= valueType :db.type/ref)) |
|
(keyword (namespace ident) (str "_" (name ident))))) |
|
(es-by-a :db/ident)))) |
|
|
|
(defn all-backrefs |
|
"Finds all entities that ref to this entity. |
|
Used for exploration. |
|
|
|
Best to use like: |
|
|
|
(let [ent (some-entity)] |
|
(all-backrefs ent) |
|
ent)" |
|
[ent] |
|
(into [] |
|
(keep (fn [backref] |
|
(when-let [ents (backref ent)] |
|
(mapv t ents)))) |
|
@possible-backrefs)) |
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
;; LANGUAGE HELPERS: |
|
(defn is-lang? |
|
[lang iso1] |
|
(= iso1 (:ow.lang/iso1 lang))) |
|
#_(is-lang? (e-by-av :ow.lang/iso3 "deu") "de") |
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
;; EXPRESSION: |
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
(defn syntr-for-expr |
|
"Returns the syntranses for the given expression" |
|
[expr] |
|
(mapv t (:ow.syntrans/_expr expr))) |
|
|
|
(defn exprs-by-spell |
|
"Expressions by exact spelling. Will return the expressions in any language it exists. |
|
Or can be limited to a certain language." |
|
([spell] |
|
(es-by-av :ow.expr/spelling spell)) |
|
([spell iso1] |
|
(filter (comp #(is-lang? % iso1) :ow.expr/lang) (exprs-by-spell spell)))) |
|
#_(exprs-by-spell "Unfall") |
|
#_(exprs-by-spell "Unfall" "de") |
|
|
|
(defn langs-for-expr |
|
"Returns the collection of languages that this expression is defined for. |
|
Exact match for spelling!" |
|
[spell] |
|
(mapv (comp tc :ow.expr/lang) (exprs-by-spell spell))) |
|
#_(langs-for-expr "neu") |
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
;; TRANSLATED CONTENT: |
|
(defn tc-lang? |
|
"If the translated content is of language lang." |
|
[tc lang] |
|
(is-lang? (:ow.text/lang tc) lang)) |
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
;; SYNTRANS |
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
(defn expr-if-syntr-is-lang |
|
"Returns the expression of the syntrans it has the given language" |
|
[syntr iso1] |
|
(let [{:ow.expr/keys [lang] :as expr} (:ow.syntrans/expr syntr)] |
|
(when (is-lang? lang iso1) |
|
(t expr)))) |
|
|
|
(declare options dm-guess-pos) |
|
(defn noun? |
|
"Checks if the syntrans is a noun. If no options present for the syntrans |
|
we fall back to the defined meaning. |
|
Returns nil if we don't know (ie we have no part of speech info)" |
|
[syntrans dm] |
|
(let [PoS (or (:PoS (options syntrans false "en")) |
|
(dm-guess-pos dm "en"))] |
|
(when PoS |
|
(= "noun" PoS)))) |
|
#_(let [dm (dm-by-spell-1 "intelligent")] |
|
(noun? (first (syntr-for-dm dm "en")) dm)) |
|
|
|
(declare syntr-for-dm dm-by-spell-1) |
|
(defn ex-syntr |
|
"DEV HELPER: A single example syntrans." |
|
[text iso1] |
|
(first |
|
(sort-by :ow.syntrans/id (syntr-for-dm (dm-by-spell-1 text) iso1)))) |
|
#_(ex-syntr "Blei" "de") |
|
|
|
(defn syntr-for-dm |
|
"Returns the syntranses that have the given DM. |
|
Probably more interesting is the expr-for-dm function!" |
|
([dm] |
|
(mapv t (:ow.syntrans/_meaning dm))) |
|
([dm iso1] |
|
(into [] |
|
(comp |
|
(filter #(expr-if-syntr-is-lang % iso1)) |
|
(map t)) |
|
(:ow.syntrans/_meaning dm)))) |
|
#_(syntr-for-dm (dm-by-spell-1 "hmm")) |
|
#_(syntr-for-dm (dm-by-spell-1 "hmm") "de") |
|
#_(syntr-for-dm (dm-by-spell-1 "dog") "de") |
|
#_(syntr-for-dm (dm-by-spell-1 "encyclopedia") "en") ;; Def in both, en-US + en-GB |
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
;; DEFINED MEANING: |
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
(defn dm-by-spell |
|
"Returns all defined meanings for the exact spelling" |
|
[spell] |
|
;; The expr in similar language might refer to the SAME DM, so we make them distinct: |
|
(into #{} |
|
(comp |
|
(mapcat syntr-for-expr) |
|
(map (comp t :ow.syntrans/meaning))) |
|
(exprs-by-spell spell))) |
|
#_(dm-by-spell "Unfall") |
|
|
|
(defn dm-by-spell-1 |
|
"DEV HELPER: Returns dm with the lowest id (thus hopefully the most common definition)" |
|
[spell] |
|
(first (sort-by :ow.meaning/id (dm-by-spell spell)))) |
|
|
|
(defn dm-text |
|
"The TEXT of the defined meaning in the given language. Can be nil. |
|
This is the \"long\" description sentence that explains the meaning. |
|
Eg, on meaning of lead: |
|
|
|
\"A small stick of graphite used in pencil that leaves marks when rubbed against a surface.\" |
|
|
|
Defaults to english." |
|
([dm] (dm-text dm "en")) |
|
([dm lang] |
|
(t (ffilter (fn [tc] (tc-lang? tc lang)) |
|
(:ow.tc/text (:ow.meaning/text dm)))))) |
|
#_(keep #(dm-text % "es") (dm-by-spell "Unfall")) |
|
#_(keep dm-text (dm-by-spell "great")) |
|
#_(keep dm-text (dm-by-spell "lead")) |
|
|
|
(defn obj-coll-for-dm |
|
"Returns the object collections that have the given DM" |
|
[dm] |
|
(:ow.coll.object/_meaning dm)) |
|
#_(mapv tc (obj-coll-for-dm (dm-by-spell-1 "Unfall"))) |
|
|
|
(defn exprs-for-dm |
|
"Returns the syntrans's expressions that have the given DM. |
|
This is more interesting since the syntr-for-dm gives no new info. |
|
|
|
- If given a language will return only expressions in that language." |
|
([dm] |
|
(mapv (comp t :ow.syntrans/expr) (:ow.syntrans/_meaning dm))) |
|
([dm iso1] |
|
(keep #(expr-if-syntr-is-lang % iso1) (:ow.syntrans/_meaning dm)))) |
|
#_(exprs-for-dm (dm-by-spell-1 "hmm")) |
|
#_(exprs-for-dm (dm-by-spell-1 "hmm") "de") |
|
|
|
(defn expr-for-dm |
|
"Returns a single expression for the given defined meaning in the given language. |
|
|
|
The one with the lowest :ow/expr/id is returned! |
|
|
|
(We hope that this is the most common one since it was added to the db early on)" |
|
([dm iso1] |
|
(first |
|
(sort-by |
|
:ow.expr/id |
|
(exprs-for-dm dm iso1))))) |
|
#_(expr-for-dm (dm-by-spell-1 "hmm") "de") |
|
#_(expr-for-dm (dm-by-spell-1 "accident") "de") |
|
|
|
;; Not really used: |
|
(defn rel1-for-dm |
|
[dm] |
|
(:ow.relation/_meaning1 dm)) |
|
#_(rel1-for-dm (dm-by-spell-1 "Unfall")) |
|
|
|
(defn rel2-for-dm |
|
[dm] |
|
(:ow.relation/_meaning2 dm)) |
|
#_(rel2-for-dm (dm-by-spell-1 "Unfall")) |
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
;; OBJECTS and OPTIONS |
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
(defn options-for-syntr |
|
"Returns the options for the given syntrans." |
|
[synt] |
|
;; First we get the objects that refer to this syntrans: |
|
(let [objects (mapv tc (:ow.attr.val/_object synt))] |
|
;; Next we get the options that point to this object: |
|
(mapv (comp tc :ow.attr.val/option) objects))) |
|
#_(options-for-syntr (ex-syntr "Blei" "de")) |
|
|
|
(defn option-meaning |
|
"Gets the options value defined meaning. |
|
(eg noun)" |
|
[opt] |
|
(-> opt :ow.option/meaning tc)) |
|
(comment |
|
(mapv option-meaning (options-for-syntr (ex-syntr "Blei" "de")))) |
|
|
|
(defn option-attr-dm |
|
"Gets the options attribute's defined meaning |
|
(eg \"part of speech\" or \"Genus\")" |
|
[opt] |
|
(-> opt :ow.option/attribute :ow.attr/attr tc)) |
|
#_(mapv option-attr-dm (options-for-syntr (ex-syntr "Blei" "de"))) |
|
|
|
(defn map-dm-to-keyword |
|
"Some defined meanings (Such as part of speech or genus) |
|
that are of interest can be mappped to keywords. |
|
If nothing is found, returns nil." |
|
[dm] |
|
(case (long (:ow.meaning/id dm)) |
|
358760 :PoS |
|
423236 :genus |
|
nil)) |
|
|
|
(defn options |
|
"Main Options API |
|
Returns a map of options for the given syntrans: |
|
|
|
expr -> expr |
|
|
|
for the given syntrans and the iso1 code. |
|
|
|
- if kw-keys is true the keys of the options are attempted to be translated to keywords |
|
if not found the original meaning is used instead. |
|
|
|
- genus option in english is: |
|
o female |
|
o neuter |
|
o masculine |
|
|
|
Similar for kw-vals." |
|
[syntr & [expr? val-iso1]] |
|
(let [val-iso1 (or val-iso1 "en") |
|
options (options-for-syntr syntr)] |
|
(zipmap |
|
(mapv (comp (fn [dm] (or (map-dm-to-keyword dm) dm)) |
|
option-attr-dm) |
|
options) |
|
(mapv (comp (if expr? |
|
#(expr-for-dm % val-iso1) |
|
(fn [dm] |
|
(:ow.expr/spelling (expr-for-dm dm val-iso1)))) |
|
:ow.option/meaning) |
|
options)))) |
|
(comment |
|
(let [word "sun", lang "de"] |
|
(list |
|
(options (ex-syntr word lang) false "de") |
|
(options (ex-syntr word lang))))) |
|
|
|
(defn dm-guess-pos |
|
"So the part of speech is only set of syntrans, which kinda not makes sense. |
|
The word for \"sun\" is really a noun in every language. |
|
Though Omegawiki has other attributes on that level (syntrans level) since |
|
they're different among languages. For instance, in some language \"sun\" is |
|
neutrum, maskuline or doesn't even have a gender. |
|
|
|
This function gets ALL the part of speech tags of all the syntrans's of the |
|
given defined meaning and takes the most common one. |
|
|
|
Returns the string (or nil) of the part of speech in language iso1." |
|
[dm iso1] |
|
(some->> (syntr-for-dm dm) |
|
(into [] |
|
(comp (keep (comp not-empty #(options % false iso1))) |
|
(map :PoS))) |
|
frequencies |
|
seq |
|
(apply max-key second) |
|
first)) |
|
#_(dm-guess-pos (dm-by-spell-1 "Sun") "en") |
|
#_(dm-guess-pos (dm-by-spell-1 "pretty") "es") |