Skip to content

Instantly share code, notes, and snippets.

@rauhs
Last active November 1, 2021 04:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rauhs/309bbd6c270bd8608f0689d8a826c223 to your computer and use it in GitHub Desktop.
Save rauhs/309bbd6c270bd8608f0689d8a826c223 to your computer and use it in GitHub Desktop.

Synopsis

Problem: Omegawiki's database layout ( http://www.omegawiki.org/Help:OmegaWiki_database_layout ) is incredibly complex and makes it tough to query the db.

This quick'n'dirty script imports the data into datomic. It aims to fully connect them with refs so that you can walk along the entities just like clojure maps.

Once in Datomic the entire db is in memory and it's very easy to explore the db and also very fast.

Thanks Datomic.

See the examples.clj file in this gist for (a tiny fraction) of what omegawiki has to offer.

(take 7 (fulltext :ow.expr/spelling "accident"))
=>
({:db/id 17592187572425,
:ow.expr/id 364150,
:ow.expr/spelling "accident",
:ow.expr/lang #:db{:id 17592187389877}}
{:db/id 17592187477622,
:ow.expr/id 8775,
:ow.expr/spelling "accident",
:ow.expr/lang #:db{:id 17592187389832}}
{:db/id 17592186790577,
:ow.expr/id 1055363,
:ow.expr/spelling "accident",
:ow.expr/lang #:db{:id 17592187389835},
:ow.object/id 1055363}
{:db/id 17592187416279,
:ow.expr/id 8764,
:ow.expr/spelling "accident",
:ow.expr/lang #:db{:id 17592187389831}}
{:db/id 17592187424597,
:ow.expr/id 98420,
:ow.expr/spelling "work accident",
:ow.expr/lang #:db{:id 17592187389831}}
{:db/id 17592187649634,
:ow.expr/id 668309,
:ow.expr/spelling "accident cérébrovasculaire",
:ow.expr/lang #:db{:id 17592187389832}}
{:db/id 17592186863035,
:ow.expr/id 1127840,
:ow.expr/spelling "bike accident",
:ow.expr/lang #:db{:id 17592187389831},
:ow.object/id 1127840})
(exprs-by-spell "Unfall")
=>
({:db/id 17592187274875,
:ow.expr/id 1539708,
:ow.expr/spelling "Unfall",
:ow.expr/lang #:db{:id 17592187389878},
:ow.object/id 1539708}
{:db/id 17592187477617,
:ow.expr/id 8768,
:ow.expr/spelling "Unfall",
:ow.expr/lang #:db{:id 17592187389847}})
(exprs-by-spell "Unfall" "de")
=>
({:db/id 17592187477617,
:ow.expr/id 8768,
:ow.expr/spelling "Unfall",
:ow.expr/lang {:db/id 17592187389847, :ow.lang/iso1 "de"}})
(langs-for-expr "neu")
=>
[{:db/id 17592187389862,
:ow.lang/id 117,
:ow.lang/iso2 "cat",
:ow.lang/iso3 "cat",
:ow.lang/wikimedia-key "ca",
:ow.lang/iso1 "ca"}
{:db/id 17592187389878, :ow.lang/id 134, :ow.lang/iso3 "ksh"}
{:db/id 17592187389847,
:ow.lang/id 101,
:ow.lang/iso2 "ger",
:ow.lang/iso3 "deu",
:ow.lang/wikimedia-key "de",
:ow.lang/iso1 "de"}]
(syntr-for-dm (dm-by-spell-1 "dog") "de")
=>
[{:db/id 17592187340161,
:ow.syntrans/expr {:db/id 17592187340160,
:ow.expr/lang {:db/id 17592187389847, :ow.lang/iso1 "de"},
:ow.expr/id 1604999,
:ow.expr/spelling "Töle",
:ow.object/id 1604999},
:ow.object/id 1605000,
:ow.syntrans/id 1605000,
:ow.syntrans/meaning #:db{:id 17592186046445},
:ow.syntrans/identical? true}
{:db/id 17592187340163,
:ow.syntrans/expr {:db/id 17592187340162,
:ow.expr/lang {:db/id 17592187389847, :ow.lang/iso1 "de"},
:ow.expr/id 1605001,
:ow.expr/spelling "Köter",
:ow.object/id 1605001},
:ow.object/id 1605002,
:ow.syntrans/id 1605002,
:ow.syntrans/meaning #:db{:id 17592186046445},
:ow.syntrans/identical? true}
{:db/id 17592186098959,
:ow.syntrans/expr {:db/id 17592187489423,
:ow.expr/lang {:db/id 17592187389847, :ow.lang/iso1 "de"},
:ow.expr/id 29432,
:ow.expr/spelling "Hund"},
:ow.object/id 206325,
:ow.syntrans/id 206325,
:ow.syntrans/meaning #:db{:id 17592186046445},
:ow.syntrans/identical? true}
{:db/id 17592187340164,
:ow.syntrans/expr {:db/id 17592187585037,
:ow.expr/lang {:db/id 17592187389847, :ow.lang/iso1 "de"},
:ow.expr/id 398242,
:ow.expr/spelling "Wauwau"},
:ow.object/id 1605003,
:ow.syntrans/id 1605003,
:ow.syntrans/meaning #:db{:id 17592186046445},
:ow.syntrans/identical? true}]
(syntr-for-dm (dm-by-spell-1 "encyclopedia") "en")
=>
[{:db/id 17592186306246,
:ow.syntrans/expr {:db/id 17592187607570,
:ow.expr/lang {:db/id 17592187389880, :ow.lang/iso1 "en"},
:ow.expr/id 472424,
:ow.expr/spelling "encyclopaedia"},
:ow.object/id 472425,
:ow.syntrans/id 472425,
:ow.syntrans/meaning #:db{:id 17592186046643},
:ow.syntrans/identical? true}
{:db/id 17592186103417,
:ow.syntrans/expr {:db/id 17592187491288,
:ow.expr/lang {:db/id 17592187389850, :ow.lang/iso1 "en"},
:ow.expr/id 33757,
:ow.expr/spelling "encyclopedia"},
:ow.object/id 210783,
:ow.syntrans/id 210783,
:ow.syntrans/meaning #:db{:id 17592186046643},
:ow.syntrans/identical? true}]
(keep #(dm-text % "es") (dm-by-spell "Unfall"))
=>
({:db/id 17592187771328,
:ow.text/lang {:db/id 17592187389833, :ow.lang/iso1 "es"},
:ow.text/id 230770,
:ow.text/text "Evento inesperado, falla o pérdida, potencialmente capaz de dañar vidas humanas, propiedades o el ambiente."})
(keep dm-text (dm-by-spell "lead"))
=>
({:db/id 17592187799896,
:ow.text/lang {:db/id 17592187389831, :ow.lang/iso1 "en"},
:ow.text/id 259946,
:ow.text/text "To treat with lead."}
{:db/id 17592187717122,
:ow.text/lang {:db/id 17592187389831, :ow.lang/iso1 "en"},
:ow.text/id 172940,
:ow.text/text "To be ahead of others, e.g., in a race."}
{:db/id 17592187723308,
:ow.text/lang {:db/id 17592187389831, :ow.lang/iso1 "en"},
:ow.text/id 180234,
:ow.text/text "The distance that a shooter aims ahead of a moving target in order to hit it with the projectile."}
{:db/id 17592187838664,
:ow.text/lang {:db/id 17592187389831, :ow.lang/iso1 "en"},
:ow.text/id 298721,
:ow.text/text "A small stick of graphite used in pencil that leaves marks when rubbed against a surface."}
{:db/id 17592187807567,
:ow.text/lang {:db/id 17592187389831, :ow.lang/iso1 "en"},
:ow.text/id 267617,
:ow.text/text "To move ahead (of others) in time or space."}
{:db/id 17592187837836,
:ow.text/lang {:db/id 17592187389831, :ow.lang/iso1 "en"},
:ow.text/id 297893,
:ow.text/text "A heavy toxic bluish-white metallic element with symbol Pb and atomic number 82 that is highly malleable; occurs principally as galena and is used in alloys, accumulators, cable sheaths, paints, and as a radiation shield."})
(exprs-for-dm (dm-by-spell-1 "accident"))
=>
[{:db/id 17592187477624,
:ow.expr/id 8777,
:ow.expr/spelling "incidente",
:ow.expr/lang #:db{:id 17592187389846}}
{:db/id 17592187477625,
:ow.expr/id 8779,
:ow.expr/spelling "ulykke",
:ow.expr/lang #:db{:id 17592187389841}}
{:db/id 17592187477621,
:ow.expr/id 8774,
:ow.expr/spelling "onnettomuudet",
:ow.expr/lang #:db{:id 17592187389845}}
{:db/id 17592187477617,
:ow.expr/id 8768,
:ow.expr/spelling "Unfall",
:ow.expr/lang #:db{:id 17592187389847}}
{:db/id 17592187477627,
:ow.expr/id 8782,
:ow.expr/spelling "авария",
:ow.expr/lang #:db{:id 17592187389834}}
{:db/id 17592187572426,
:ow.expr/id 364152,
:ow.expr/spelling "ongeluk",
:ow.expr/lang #:db{:id 17592187389835}}
{:db/id 17592186514400,
:ow.expr/id 774403,
:ow.expr/spelling "ავარია",
:ow.expr/lang #:db{:id 17592187389869},
:ow.object/id 774403}
{:db/id 17592187477626,
:ow.expr/id 8780,
:ow.expr/spelling "awaria",
:ow.expr/lang #:db{:id 17592187389839}}
{:db/id 17592187177719,
:ow.expr/id 1442549,
:ow.expr/spelling "ప్రమాదం",
:ow.expr/lang #:db{:id 17592187389901},
:ow.object/id 1442549}
{:db/id 17592187477616,
:ow.expr/id 8767,
:ow.expr/spelling "uheld",
:ow.expr/lang #:db{:id 17592187389849}}
{:db/id 17592187572420,
:ow.expr/id 364140,
:ow.expr/spelling "akcidento",
:ow.expr/lang #:db{:id 17592187389858}}
{:db/id 17592187477628,
:ow.expr/id 8783,
:ow.expr/spelling "havária",
:ow.expr/lang #:db{:id 17592187389843}}
{:db/id 17592187572425,
:ow.expr/id 364150,
:ow.expr/spelling "accident",
:ow.expr/lang #:db{:id 17592187389877}}
{:db/id 17592187416279,
:ow.expr/id 8764,
:ow.expr/spelling "accident",
:ow.expr/lang #:db{:id 17592187389831}}
{:db/id 17592187477618,
:ow.expr/id 8769,
:ow.expr/spelling "ατυχήματα",
:ow.expr/lang #:db{:id 17592187389851}}
{:db/id 17592187477620,
:ow.expr/id 8772,
:ow.expr/spelling "õnnetus",
:ow.expr/lang #:db{:id 17592187389844}}
{:db/id 17592186813200,
:ow.expr/id 1077986,
:ow.expr/spelling "accidente",
:ow.expr/lang #:db{:id 17592187389952},
:ow.object/id 1077986}
{:db/id 17592187275779,
:ow.expr/id 1540612,
:ow.expr/spelling "infortunio",
:ow.expr/lang #:db{:id 17592187389846},
:ow.object/id 1540612}
{:db/id 17592187318097,
:ow.expr/id 1582936,
:ow.expr/spelling "кырсык",
:ow.expr/lang #:db{:id 17592187389970},
:ow.object/id 1582936}
{:db/id 17592187572417,
:ow.expr/id 364134,
:ow.expr/spelling "ezbehar",
:ow.expr/lang #:db{:id 17592187389842}}
{:db/id 17592187274875,
:ow.expr/id 1539708,
:ow.expr/spelling "Unfall",
:ow.expr/lang #:db{:id 17592187389878},
:ow.object/id 1539708}
{:db/id 17592187161687,
:ow.expr/id 1426517,
:ow.expr/spelling "akabeenje",
:ow.expr/lang #:db{:id 17592187389998},
:ow.object/id 1426517}
{:db/id 17592187544961,
:ow.expr/id 134273,
:ow.expr/spelling "nehoda",
:ow.expr/lang #:db{:id 17592187389843}}
{:db/id 17592186790544,
:ow.expr/id 1055330,
:ow.expr/spelling "gwallzarvoud",
:ow.expr/lang #:db{:id 17592187389925},
:ow.object/id 1055330}
{:db/id 17592187572424,
:ow.expr/id 364148,
:ow.expr/spelling "事故",
:ow.expr/lang #:db{:id 17592187389857}}
{:db/id 17592187268315,
:ow.expr/id 1533148,
:ow.expr/spelling "disgrazia",
:ow.expr/lang #:db{:id 17592187389846},
:ow.object/id 1533148}
{:db/id 17592187614275,
:ow.expr/id 494989,
:ow.expr/spelling "គ្រោះត្នាក់",
:ow.expr/lang #:db{:id 17592187389883}}
{:db/id 17592187544721,
:ow.expr/id 133963,
:ow.expr/spelling "nesreča",
:ow.expr/lang #:db{:id 17592187389838}}
{:db/id 17592187560170,
:ow.expr/id 161879,
:ow.expr/spelling "վթար",
:ow.expr/lang #:db{:id 17592187389868}}
{:db/id 17592187281805,
:ow.expr/id 1546638,
:ow.expr/spelling "விபத்து",
:ow.expr/lang #:db{:id 17592187389907},
:ow.object/id 1546638}
{:db/id 17592187477623,
:ow.expr/id 8776,
:ow.expr/spelling "baleset",
:ow.expr/lang #:db{:id 17592187389848}}
{:db/id 17592186790577,
:ow.expr/id 1055363,
:ow.expr/spelling "accident",
:ow.expr/lang #:db{:id 17592187389835},
:ow.object/id 1055363}
{:db/id 17592187274873,
:ow.expr/id 1539706,
:ow.expr/spelling "Onfall",
:ow.expr/lang #:db{:id 17592187389878},
:ow.object/id 1539706}
{:db/id 17592186790579,
:ow.expr/id 1055365,
:ow.expr/spelling "malheur",
:ow.expr/lang #:db{:id 17592187389835},
:ow.object/id 1055365}
{:db/id 17592187477615,
:ow.expr/id 8766,
:ow.expr/spelling "havárie",
:ow.expr/lang #:db{:id 17592187389836}}
{:db/id 17592187542316,
:ow.expr/id 130911,
:ow.expr/spelling "olycka",
:ow.expr/lang #:db{:id 17592187389837}}
{:db/id 17592187572422,
:ow.expr/id 364144,
:ow.expr/spelling "kecelakaan",
:ow.expr/lang #:db{:id 17592187389908}}
{:db/id 17592187572418,
:ow.expr/id 364136,
:ow.expr/spelling "istripu",
:ow.expr/lang #:db{:id 17592187389842}}
{:db/id 17592187477619,
:ow.expr/id 8771,
:ow.expr/spelling "accidente",
:ow.expr/lang #:db{:id 17592187389833}}
{:db/id 17592186593375,
:ow.expr/id 853378,
:ow.expr/spelling "acidente",
:ow.expr/lang #:db{:id 17592187389840},
:ow.object/id 853378}
{:db/id 17592187477622,
:ow.expr/id 8775,
:ow.expr/spelling "accident",
:ow.expr/lang #:db{:id 17592187389832}}
{:db/id 17592187572415,
:ow.expr/id 364129,
:ow.expr/spelling "ongeval",
:ow.expr/lang #:db{:id 17592187389835}}
{:db/id 17592187572421,
:ow.expr/id 364142,
:ow.expr/spelling "acidento",
:ow.expr/lang #:db{:id 17592187389876}}
{:db/id 17592187274877,
:ow.expr/id 1539710,
:ow.expr/spelling "Onvall",
:ow.expr/lang #:db{:id 17592187389878},
:ow.object/id 1539710}
{:db/id 17592187572423,
:ow.expr/id 364146,
:ow.expr/spelling "musibah",
:ow.expr/lang #:db{:id 17592187389908}}
{:db/id 17592187650629,
:ow.expr/id 672568,
:ow.expr/spelling "аварія",
:ow.expr/lang #:db{:id 17592187389861}}
{:db/id 17592187572419,
:ow.expr/id 364138,
:ow.expr/spelling "Злополука",
:ow.expr/lang #:db{:id 17592187389830}}
{:db/id 17592187572416,
:ow.expr/id 364132,
:ow.expr/spelling "nezgoda",
:ow.expr/lang #:db{:id 17592187389838}}
{:db/id 17592186910691,
:ow.expr/id 1175499,
:ow.expr/spelling "malfelicxo",
:ow.expr/lang #:db{:id 17592187389858},
:ow.object/id 1175499}]
(expr-for-dm (dm-by-spell-1 "accident") "de")
=>
{:db/id 17592187477617,
:ow.expr/lang {:db/id 17592187389847, :ow.lang/iso1 "de"},
:ow.expr/id 8768,
:ow.expr/spelling "Unfall"}
(let [word "sun", lang "de"]
(list
(options (ex-syntr word lang) false "de")
(options (ex-syntr word lang))))
=> ({:PoS "Hauptwort", :genus "Maskulinum"} {:PoS "noun", :genus "masculine"})
(dm-guess-pos (dm-by-spell-1 "pretty") "es")
=> "adjetivo"
(dm-guess-pos (dm-by-spell-1 "Sun") "en")
=> "noun"
(ns omegawiki.datomic
"
Omegawiki data import to datomic
Needs about 800 MB memory when fully loaded. More during transacting!
# Notes:
We could unify :ow.expr/lang and :ow.option/lang (etc) to :ow/lang
-> Adv: Simpler walking the graph, same meaning
-> Disadv: A littler harder querying (no more e-by-av but real queries needed)."
(:require
[clojure.java.jdbc :as jdbc]
[datomic.api :as da]
[clojure.string :as str])
(:import
(datomic Entity)))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; MYSQL HELPERS
(defn stringify-bytes
[s]
(String. ^bytes s))
(defn update*
[m k & args]
(if-let [[k v] (find m k)]
(apply update m k args)
m))
(defn row-fn
[row]
(-> row
(update* :table keyword)
(update* :spelling stringify-bytes)
(update* :type_spelling stringify-bytes)
(update* :value_spelling stringify-bytes)
(update* :text stringify-bytes)
(update* :text_text stringify-bytes)
(update* :text_flags stringify-bytes)))
(defn q
"Queries the Omegawiki mysql db."
[& args]
(jdbc/query {:subprotocol "mysql"
:subname "//127.0.0.1:3306/ow"
:user "ow"
:password "foobarlol"}
(keep identity args)
{:row-fn row-fn}))
(defn ffilter
"First filter."
[f xs]
(first (filter f xs)))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; DATOMIC SETUP:
(def ^:const uri "datomic:mem://ow")
(def conn
(do
(da/create-database uri)
(da/connect uri)))
#_(da/delete-database uri)
(defn db [] (da/db conn))
(defn num-datoms
"Just for fun. ~9.2M Datoms"
[]
(count (seq (da/datoms (db) :eavt))))
(def object-table->ref-id
"Maps the objects in the ojbect table to the ids in datomic these map to."
{:uw_translated_content_attribute_values :ow.trans-content.attr/id,
:uw_class_attributes :ow.attr/id
:uw_class_membership :ow.class/id
:uw_text_attribute_values :ow.text.attr/id,
:uw_syntrans :ow.syntrans/id,
:uw_expression :ow.expr/id,
:uw_expression_ns nil, ;; Table doesn't exist anymore...
:uw_translated_content :ow.tc/id,
:uw_defined_meaning :ow.meaning/id,
:uw_meaning_relations :ow.relation/id,
:uw_collection :ow.coll/id,
:uw_option_attribute_options :ow.option/id,
:uw_url_attribute_values :ow.url/id,
:uw_option_attribute_values :ow.attr.val/id,
:translated_content nil ;; Table doesn't exist anymore?
:uw_collection_ns nil})
(defn t!
[tx]
@(da/transact conn tx))
(defn t
"Shorthand for touch"
[x]
(when (and x (instance? Entity x))
(da/touch x)))
(defn tc
"Shorthand for touch of all children"
[ent]
(when ent
(run! (comp t #(% ent)) (keys ent))
ent))
(defn es-by-a
"Returns all entities with the given attribute."
[a]
(let [db (db)]
(map (fn [[e]]
(t (da/entity db e)))
(da/datoms db :aevt a))))
#_(es-by-a :db/doc)
(defn es-by-av
"Returns all entities with the given attribute + value."
[a v]
(let [db (db)]
(map (fn [[e]]
(t (da/entity db e)))
(da/datoms db :avet a v))))
#_(es-by-av :foo 8)
(defn fulltext
[attr txt]
(let [db (db)]
(map (fn [[e score]]
(t (da/entity db e)))
(sort-by (comp - second)
(da/q '[:find ?e ?score
:in $ ?attr ?txt
:where [(fulltext $ ?attr ?txt) [[?e _ _ ?score]]]]
db attr txt)))))
#_(take 7 (fulltext :ow.expr/spelling "accident"))
(defn e-by-av
"An single entity for the given attr + value"
[a v]
(t (first (es-by-av a v))))
(defn e-by-a
"Eagerly returns all entities with the given attribute + value."
[a]
(t (first (es-by-a a))))
(defn entity
"Datomic entity with touch."
[eid]
(t (da/entity (db) eid)))
(defn ident->str
[kw]
(subs (str kw) 1))
;; Problem:
;; Omegawiki sometimes differentiates between US/UK english.
;; So lang/id 85 (normal "eng") might not be used for english expression and instead
;; "eng-UK" or "eng-US" is used (as ISO3)
(def iso3->iso1
"Note: This map is NOT invertable. We're mapping to the same ISO1 sometimes
(eg eng-US and eng-UK are both \"en\" in iso1)"
{"abk" "ab" "aar" "aa" "afr" "af" "aka" "ak" "sqi" "sq" "amh" "am" "ara" "ar" "arg" "an"
"hye" "hy" "asm" "as" "ava" "av" "ave" "ae" "aym" "ay" "aze" "az" "bam" "bm" "bak" "ba"
"eus" "eu" "bel" "be" "ben" "bn" "bih" "bh" "bis" "bi" "bos" "bs" "bre" "br" "bul" "bg"
"mya" "my" "cat" "ca" "cha" "ch" "che" "ce" "nya" "ny" "zho" "zh" "chv" "cv" "cor" "kw"
"cos" "co" "cre" "cr" "hrv" "hr" "ces" "cs" "dan" "da" "div" "dv" "nld" "nl" "dzo" "dz"
"eng" "en" "epo" "eo" "est" "et" "ewe" "ee" "fao" "fo" "fij" "fj" "fin" "fi" "fra" "fr"
"ful" "ff" "glg" "gl" "kat" "ka" "deu" "de" "ell" "el" "grn" "gn" "guj" "gu" "hat" "ht"
"hau" "ha" "heb" "he" "her" "hz" "hin" "hi" "hmo" "ho" "hun" "hu" "ina" "ia" "ind" "id"
"ile" "ie" "gle" "ga" "ibo" "ig" "ipk" "ik" "ido" "io" "isl" "is" "ita" "it" "iku" "iu"
"jpn" "ja" "jav" "jv" "kal" "kl" "kan" "kn" "kau" "kr" "kas" "ks" "kaz" "kk" "khm" "km"
"kik" "ki" "kin" "rw" "kir" "ky" "kom" "kv" "kon" "kg" "kor" "ko" "kur" "ku" "kua" "kj"
"lat" "la" "ltz" "lb" "lug" "lg" "lim" "li" "lin" "ln" "lao" "lo" "lit" "lt" "lub" "lu"
"lav" "lv" "glv" "gv" "mkd" "mk" "mlg" "mg" "msa" "ms" "mal" "ml" "mlt" "mt" "mri" "mi"
"mar" "mr" "mah" "mh" "mon" "mn" "nau" "na" "nav" "nv" "nde" "nd" "nep" "ne" "ndo" "ng"
"nob" "nb" "nno" "nn" "nor" "no" "iii" "ii" "nbl" "nr" "oci" "oc" "oji" "oj" "chu" "cu"
"orm" "om" "ori" "or" "oss" "os" "pan" "pa" "pli" "pi" "fas" "fa" "pol" "pl" "pus" "ps"
"por" "pt" "que" "qu" "roh" "rm" "run" "rn" "ron" "ro" "rus" "ru" "san" "sa" "srd" "sc"
"snd" "sd" "sme" "se" "smo" "sm" "sag" "sg" "srp" "sr" "gla" "gd" "sna" "sn" "sin" "si"
"slk" "sk" "slv" "sl" "som" "so" "sot" "st" "spa" "es" "sun" "su" "swa" "sw" "ssw" "ss"
"swe" "sv" "tam" "ta" "tel" "te" "tgk" "tg" "tha" "th" "tir" "ti" "bod" "bo" "tuk" "tk"
"tgl" "tl" "tsn" "tn" "ton" "to" "tur" "tr" "tso" "ts" "tat" "tt" "twi" "tw" "tah" "ty"
"uig" "ug" "ukr" "uk" "urd" "ur" "uzb" "uz" "ven" "ve" "vie" "vi" "vol" "vo" "wln" "wa"
"cym" "cy" "wol" "wo" "fry" "fy" "xho" "xh" "yid" "yi" "yor" "yo" "zha" "za" "zul" "zu"
;; Manually add some mappings: (TODO: Chinese mappings)
"eng-UK" "en"
"eng-US" "en"})
(defn t-datomic-schema!
"Transacts the datomic schema!"
[]
(let [attr (fn attr
[ident col doc]
{:db/id (ident->str ident) ;; We use string temp ids to ref in our tx
:db/ident ident
:db/cardinality :db.cardinality/one
:ow/columns {:db/id col
:ow/column col}
:db/doc doc})
many! #(assoc % :db/cardinality :db.cardinality/many)
fulltext! #(assoc % :db/fulltext true)
index! #(assoc % :db/index true)
component! #(assoc % :db/isComponent true)
id! #(assoc % :db/unique :db.unique/identity)
str! #(assoc % :db/valueType :db.type/string)
kw! #(assoc % :db/valueType :db.type/keyword)
bool! #(assoc % :db/valueType :db.type/boolean)
ref! (fn [m ident]
(cond-> (assoc m :db/valueType :db.type/ref)
ident (assoc :ow/refs (ident->str ident)))) ;; use string tempid
long! #(assoc % :db/valueType :db.type/long)]
;; Bootstrap the schema with a :ow/column so we can query the schema right away
(t! [{:db/ident :ow/column
:db/cardinality :db.cardinality/one
:db/unique :db.unique/identity
:db/valueType :db.type/string}
{:db/ident :ow/columns
:db/cardinality :db.cardinality/many
;:db/unique :db.unique/identity
:db/valueType :db.type/ref}
;; We also document which :db/ident a ref references.
;; We need this when transacting to wrap the transaction ref with that :db/ident:
{:db/ident :ow/refs
:db/cardinality :db.cardinality/one
:db/valueType :db.type/ref}])
(->>
[;; TABLE: language
#_(q "describe language")
#_(q "select * from language limit 10")
#_(q "select COUNT(*) from language")
(-> (attr :ow.lang/id "language.language_id"
"The language id. PK of language table.")
long! id!)
(-> (attr :ow.lang/dialect-of "language.dialect_of_lid"
"A ref to a language.")
(ref! :ow.lang/id))
(-> (attr :ow.lang/iso2 "language.iso639_2"
"The ISO 639-2 string. Sometimes not present! We can have the same ISO-2 Codes!!")
str! index!)
(-> (attr :ow.lang/iso1 "non-existing.iso1"
"The ISO 639-1 string. Sometimes not present! We can have the same ISO-1 Codes!!")
str! index!)
(-> (attr :ow.lang/iso3 "language.iso639_3"
"The ISO 639-3 string. Distinct.")
str! id!)
(-> (attr :ow.lang/wikimedia-key "language.wikimedia_key"
"Sometimes NOT present!")
str! id!)
;; language_names TABLE:
;; Not needed. We're fine with only english language names...
;; uw_alt_meaningtexts TABLE:
;; Not needed? Only ~450 entries.
;; uw_bootstrapped_defined_meanings TABLE:
;; Not needed. We can use idents for these 5 entries if needed.
;; TABLE: uw_class_attributes
(-> (attr :ow.attr/id "uw_class_attributes.object_id"
"The object id. PK of table.")
long! id!)
(-> (attr :ow.attr/meaning "uw_class_attributes.class_mid"
"A ref to a meaning.")
(ref! :ow.meaning/id))
(-> (attr :ow.attr/level "uw_class_attributes.level_mid"
"A ref to a meaning.")
(ref! :ow.meaning/id))
(-> (attr :ow.attr/attr "uw_class_attributes.attribute_mid"
"A ref to a meaning.")
(ref! :ow.meaning/id))
(-> (attr :ow.attr/type "uw_class_attributes.attribute_type"
"Either :text, :optn, :dm, :url or :trns")
kw! index!)
;; TABLE: uw_class_membership
(-> (attr :ow.class/id "uw_class_membership.class_membership_id"
"The PK? (but not really?)")
long! id!)
(-> (attr :ow.class/meaning "uw_class_membership.class_mid"
"Identifies the class (eg. \"animal\")")
(ref! :ow.meaning/id))
(-> (attr :ow.class/member "uw_class_membership.class_member_mid"
"Identifies the member (eg . \"dog\")")
(ref! :ow.meaning/id))
;; TABLE: uw_collection
(-> (attr :ow.coll/id "uw_collection.collection_id"
"The PK? (but not really?)")
long! id!)
(-> (attr :ow.coll/meaning "uw_collection.collection_mid"
"A ref to the meaning")
(ref! :ow.meaning/id))
(-> (attr :ow.coll/type "uw_collection.collection_type"
"A keyword, either :CLAS or :LANG")
kw! index!)
;; TABLE: uw_collection_contents
(-> (attr :ow.coll.object/id "uw_collection_contents.object_id"
"ALWAYS NIL! HMMM")
long! id!)
(-> (attr :ow.coll.object/coll "uw_collection_contents.collection_id"
"A ref to the collection")
(ref! :ow.coll/id))
(-> (attr :ow.coll.object/meaning "uw_collection_contents.member_mid"
"A ref to the meaning")
(ref! :ow.meaning/id))
;; TABLE: uw_collection_language
;; => EMPTY ignore
;; TABLE: uw_defined_meaning
#_(q "select * from uw_defined_meaning LIMIT 20")
(-> (attr :ow.meaning/id "uw_defined_meaning.defined_meaning_id"
"The meaning id. PK of table.")
long! id!)
(-> (attr :ow.meaning/expr "uw_defined_meaning.expression_id"
"A ref to the expression")
(ref! :ow.expr/id))
(-> (attr :ow.meaning/text "uw_defined_meaning.meaning_text_tcid"
"A ref to the translatable content")
(ref! :ow.tc/id))
;; TABLE: uw_expression
#_(q "select * from uw_expression limit 10")
(-> (attr :ow.expr/id "uw_expression.expression_id"
"The expression id. PK of table.")
long! id!)
(-> (attr :ow.expr/spelling "uw_expression.spelling"
"The actual text")
str! fulltext! index!)
(-> (attr :ow.expr/lang "uw_expression.language_id"
"A ref to a language")
(ref! :ow.lang/id))
;; TABLE: uw_meaning_relations
#_(q "select * from uw_meaning_relations limit 10")
#_(q "select DISTINCT(relationtype_mid) from uw_meaning_relations limit 10")
(-> (attr :ow.relation/id "uw_meaning_relations.relation_id"
"PK of table.")
long! id!)
(-> (attr :ow.relation/meaning1 "uw_meaning_relations.meaning1_mid"
"A ref to a meaning")
(ref! :ow.meaning/id))
(-> (attr :ow.relation/meaning2 "uw_meaning_relations.meaning2_mid"
"A ref to a meaning")
(ref! :ow.meaning/id))
;; TODO: Or to a attribute? I don't use this though...
(-> (attr :ow.relation/type "uw_meaning_relations.relationtype_mid"
"A ref to defined meaning")
(ref! :ow.meaning/id))
;; TABLE: uw_objects
;; TODO: This table is just aweful. It ref's rows in various tables
;; We should hand transact this and create the refs to the table-objects
;; We manually transform this table. Datomic can ref anything and is flexible...
#_(q "select * from uw_objects limit 10")
(-> (attr :ow.object/id "uw_objects.object_id"
"PK of table.")
long! id!)
;; Still add table so we know what kind of ref this is.
;; Better would be to do different refs like:
;; - :ow/syntrans
;; - :ow/expr .... etc.
#_(-> (attr :ow.object/table "uw_objects.table"
"The table keyword")
index! kw!)
#_(-> (attr :ow.object/ref "doesnt.exist.xyz"
"The table keyword")
;; What it refs depends on the table and is looked up with the map:
;; object-table->ref-id (see above)
(ref! nil))
;; TABLE: uw_option_attribute_options
#_(q "select * from uw_option_attribute_options limit 10")
#_(q "select COUNT(*) from uw_option_attribute_options limit 10")
(-> (attr :ow.option/id "uw_option_attribute_options.option_id"
"PK of table.")
long! id!)
(-> (attr :ow.option/attribute "uw_option_attribute_options.attribute_id"
"A ref to an object")
(ref! :ow.object/id))
(-> (attr :ow.option/meaning "uw_option_attribute_options.option_mid"
"A ref to an defined meaning")
(ref! :ow.meaning/id))
(-> (attr :ow.option/lang "uw_option_attribute_options.language_id"
"A ref to a language")
(ref! :ow.lang/id))
;; TABLE: uw_option_attribute_values
#_(q "describe uw_option_attribute_values")
#_(q "select * from uw_option_attribute_values limit 10")
#_(q "select COUNT(*) from uw_option_attribute_values")
(-> (attr :ow.attr.val/id "uw_option_attribute_values.value_id"
"PK of table.")
long! id!)
;; NOTE: This also refers to syntrans id's (when used for Part of speech, genus).
(-> (attr :ow.attr.val/object "uw_option_attribute_values.object_id"
"Ref to an object")
(ref! :ow.object/id))
(-> (attr :ow.attr.val/option "uw_option_attribute_values.option_id"
"Ref to an option")
(ref! :ow.option/id))
;; TABLE: uw_syntrans
#_(q "describe uw_syntrans")
#_(q "select * from uw_syntrans limit 10")
#_(q "select COUNT(*) from uw_syntrans") ;; 680k
(-> (attr :ow.syntrans/id "uw_syntrans.syntrans_sid"
"PK of table.")
long! id!)
(-> (attr :ow.syntrans/meaning "uw_syntrans.defined_meaning_id"
"Ref to a meaning")
(ref! :ow.meaning/id))
(-> (attr :ow.syntrans/expr "uw_syntrans.expression_id"
"Ref to an expression")
(ref! :ow.expr/id))
(-> (attr :ow.syntrans/identical? "uw_syntrans.identical_meaning"
"To avoid semantic drift. Specify if the translation is exact or approx.")
bool!)
;; TABLE: uw_text
#_(q "describe uw_text")
#_(q "select * from uw_text limit 10")
#_(q "select DISTINCT(text_flags) from uw_text") ;; always ""
#_(q "select COUNT(*) from uw_text") ;; 226k
(-> (attr :ow.text/id "uw_text.text_id"
"The PK.")
long! id!)
(-> (attr :ow.text/text "uw_text.text_text"
"The main text. The long description for the Defined Meaning. Also examples?")
str!)
(-> (attr :ow.text/lang "doens't exist"
;; SPECIAL! Manual
"A ref to a language")
(ref! :ow.lang/id))
;; text_flags is always "" so we omit it.
;; TABLE: uw_text_attribute_values
#_(q "describe uw_text_attribute_values")
#_(q "select * from uw_text_attribute_values limit 10")
#_(q "select COUNT(*) from uw_text_attribute_values") ;; 42k
(-> (attr :ow.text.attr/id "uw_text_attribute_values.value_id"
"The PK.")
long! id!)
(-> (attr :ow.text.attr/object "uw_text_attribute_values.object_id"
"ref to an object from objects table")
(ref! :ow.object/id))
(-> (attr :ow.text.attr/attr "uw_text_attribute_values.attribute_mid"
"ref to a meaning")
(ref! :ow.meaning/id))
(-> (attr :ow.text.attr/text "uw_text_attribute_values.text"
"The actual text string")
str!)
;; TABLE: uw_translated_content
#_(q "describe uw_translated_content")
#_(q "select * from uw_translated_content limit 10")
#_(q "select COUNT(*) from uw_translated_content") ;; 226k
;; NOTE: This table design is bad. A single :ow.text/id
;; has only ever ONE language. So we should add the language ref to the
;; :ow.text/id instead of in this table. We'll do this manually:
;; NOT IDEMPOTENT since we have a many ref here!
(-> (attr :ow.tc/id "uw_translated_content.translated_content_id"
"The translated content id. PK of table.")
long! id!)
;; Ignore the language_id and don't add it here but instead ot the
;; :ow.text/id entity itself!
(-> (attr :ow.tc/text "uw_translated_content.text_id"
"Many ref to a the texts")
(ref! :ow.text/id) many!)
;; TABLE: uw_translated_content_attribute_values
#_(q "describe uw_translated_content_attribute_values")
#_(q "select * from uw_translated_content_attribute_values limit 10")
#_(q "select COUNT(*) from uw_translated_content_attribute_values")
(-> (attr :ow.trans-content.attr/id "uw_translated_content_attribute_values.value_id"
"PK of table.")
long! id!)
(-> (attr :ow.trans-content.attr/object "uw_translated_content_attribute_values.object_id"
"Ref to an object")
(ref! :ow.object/id))
(-> (attr :ow.trans-content.attr/attr "uw_translated_content_attribute_values.attribute_mid"
"Ref to a meaning")
(ref! :ow.meaning/id))
(-> (attr :ow.trans-content.attr/tc "uw_translated_content_attribute_values.value_tcid"
"Ref to a translated content")
(ref! :ow.tc/id))
;; TABLE: uw_url_attribute_values
#_(q "describe uw_url_attribute_values")
#_(q "select * from uw_url_attribute_values limit 10")
#_(q "select DISTINCT(label) from uw_url_attribute_values limit 10")
#_(q "select COUNT(*) from uw_url_attribute_values")
(-> (attr :ow.url/id "uw_url_attribute_values.value_id"
"PK of table.")
long! id!)
(-> (attr :ow.url/object "uw_url_attribute_values.object_id"
"Ref to an object")
(ref! :ow.object/id))
(-> (attr :ow.url/attr "uw_url_attribute_values.attribute_mid"
"Ref to a meaning")
(ref! :ow.meaning/id))
(-> (attr :ow.url/url "uw_url_attribute_values.url"
"The actual URL")
str!)
(-> (attr :ow.url/label "uw_url_attribute_values.label"
"The link text for the URL")
str!)
;; TABLE: wikidata_sets
;; HMM doesn't exist???
#_(q "describe wikidata_sets")]
t!)))
(defn key-translator
"Given a single row of a query, will return a function that translates the keys
to the datomic keys."
[ex-map table strip-empty? transform]
(let [kv-mapper (reduce-kv
(fn [m k _]
(let [column (e-by-av :ow/column (str table "." (name k)))
schema-entries (:ow/_columns column)
_ (assert (or (nil? schema-entries) (= 1 (count schema-entries)))
(pr-str "HMM:" schema-entries (str table "." (name k))))
{:db/keys [ident], :ow/keys [refs]} (first schema-entries)]
(assoc m k
[ident (if refs
(fn [v] {refs v})
(fn [v] v))])))
{}
ex-map)]
(fn [m]
(into {}
(map (fn [[k v]]
(let [[mapped-key v-map] (get kv-mapper k)
v ((get transform k identity) v)]
(if (and strip-empty?
(or (nil? v) (= "" v)))
nil
[mapped-key (v-map v)]))))
m))))
(defn transact-table
"Transacts the table into datomic. If strip-emtpy? is true all NULL and empty strings
are not added to the entity (Done so we can still mark these attributes as :db.unique/..."
[table more strip-empty? transform]
(let [before (System/currentTimeMillis)
data (q (str "Select * from " table " " more))
tr (key-translator (first data) table strip-empty? transform)
tx-res (t! (mapv tr data))]
(println (str "Entities: " (count data)))
(println (str "Took: " (/ (- (System/currentTimeMillis) before) 1000.0) "s"))
;; Return an example entity:
(first (map tr data))
#_tx-res))
(defn import-all
"TAKES SOME TIME! (2-3 min)
Imports all the tables into datomic"
[]
;; Code is like this to use the REPL
;; We need to first transact the object table since we need both of the identity
;; attributes to belong to the same :db/id in Datomic:
;; Manual transacting since the object table ref's various entities depending on the
;; table. We take a simple approach here and simple add both, ow.xyz/id as well as
;; ow.object/id to the same entity. That way we safe one intermediate hop/entity which
;; adds nothing.
(t!
(into []
(keep
(fn [{:keys [object_id table]}]
(when-some [ref-ident (get object-table->ref-id table)]
;; We also associate the object id to the same entity.
;; That way we save "one hope"
{:ow.object/id object_id
ref-ident object_id})))
(q "select * from uw_objects")))
;; 500 -> 0s
(transact-table "language" nil
true
;; Not interested in the dialect_of data:
{:dialect_of_lid (constantly nil)})
(es-by-av :ow.lang/id 85) ;; Works English.
;; Manually add ISO1 version which I need:
(t! (reduce-kv
(fn [xs iso3 iso1]
(conj xs
{:ow.lang/iso3 iso3
:ow.lang/iso1 iso1}))
[]
iso3->iso1))
;; 300 -> 0s
(transact-table "uw_class_attributes"
"WHERE remove_transaction_id IS NULL"
true {:attribute_type keyword
:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(es-by-av :ow.attr/type :TEXT)
;; 11k -> 1s
(transact-table "uw_class_membership"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(e-by-a :ow.class/id)
;; 41 -> 0s
(transact-table "uw_collection"
"WHERE remove_transaction_id IS NULL"
true {:collection_type keyword
:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(e-by-a :ow.coll/id)
;; TODO: This is n-m relation without an id. We should ref-many the collection the meaning
;; 25k -> 2s
(transact-table "uw_collection_contents"
"WHERE remove_transaction_id IS NULL"
true {:internal_member_id (constantly nil) ;; Not interested...
:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
;; 55k -> 3s
(transact-table "uw_defined_meaning"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(e-by-a :ow.meaning/id)
;; 524k -> 26s
(transact-table "uw_expression"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(take 10 (es-by-a :ow.expr/id))
(take 5 (fulltext :ow.expr/spelling "house"))
;; 41k -> 3s
(transact-table "uw_meaning_relations"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(take 5 (es-by-a :ow.relation/id))
;; 700 -> 0s
(transact-table "uw_option_attribute_options"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(take 5 (es-by-a :ow.option/id))
;; 123k vals -> 5s
(transact-table "uw_option_attribute_values"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(take 5 (es-by-a :ow.attr.val/id))
;; 600k values -> 40s
(transact-table "uw_syntrans"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(take 5 (es-by-a :ow.syntrans/id))
;; 226k -> 5s
;; We add some language entires to these entities later on!
(transact-table "uw_text"
""
true {:text_flags (constantly nil)})
(take 5 (es-by-a :ow.text/id))
;; 42k -> 3s
(transact-table "uw_text_attribute_values"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(take 5 (es-by-a :ow.text.attr/id))
;; 173k -> 8s
;; NOT IDEMPOTENT!! We have ref-many attrs in this one!!!!!
(transact-table "uw_translated_content"
"WHERE remove_transaction_id IS NULL"
;; Special treatment for the language_id here. We don't add it to this entity
;; but instead add it to the :ow.text/id itself. Much better.
true {:language_id (constantly nil)
:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(take 5 (es-by-a :ow.tc/id))
;; NOTE: Manually query the language_id and add a lang ref to the :ow.text/id entity.
;; 5sec
(t!
(mapv
(fn [{:keys [language_id text_id]}]
{:ow.text/id text_id
:ow.text/lang {:ow.lang/id language_id}})
(q (str "select language_id, text_id from uw_translated_content"
" where language_id is not null"))))
;; 600 -> 0s
(transact-table "uw_translated_content_attribute_values"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(take 5 (es-by-a :ow.trans-content.attr/id))
;; 9k -> 1s
(transact-table "uw_url_attribute_values"
"WHERE remove_transaction_id IS NULL"
true {:remove_transaction_id (constantly nil)
:add_transaction_id (constantly nil)})
(take 5 (es-by-a :ow.url/id)))
#_(time (import-all))
(def possible-backrefs
"All (delay) collection of all backrefs (:foo.bar/_xyz) keywords of the datomic db"
(delay
(keep
(fn [{:db/keys [ident valueType]}]
(when (and (.startsWith (str ident) ":ow.")
(= valueType :db.type/ref))
(keyword (namespace ident) (str "_" (name ident)))))
(es-by-a :db/ident))))
(defn all-backrefs
"Finds all entities that ref to this entity.
Used for exploration.
Best to use like:
(let [ent (some-entity)]
(all-backrefs ent)
ent)"
[ent]
(into []
(keep (fn [backref]
(when-let [ents (backref ent)]
(mapv t ents))))
@possible-backrefs))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; LANGUAGE HELPERS:
(defn is-lang?
[lang iso1]
(= iso1 (:ow.lang/iso1 lang)))
#_(is-lang? (e-by-av :ow.lang/iso3 "deu") "de")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; EXPRESSION:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defn syntr-for-expr
"Returns the syntranses for the given expression"
[expr]
(mapv t (:ow.syntrans/_expr expr)))
(defn exprs-by-spell
"Expressions by exact spelling. Will return the expressions in any language it exists.
Or can be limited to a certain language."
([spell]
(es-by-av :ow.expr/spelling spell))
([spell iso1]
(filter (comp #(is-lang? % iso1) :ow.expr/lang) (exprs-by-spell spell))))
#_(exprs-by-spell "Unfall")
#_(exprs-by-spell "Unfall" "de")
(defn langs-for-expr
"Returns the collection of languages that this expression is defined for.
Exact match for spelling!"
[spell]
(mapv (comp tc :ow.expr/lang) (exprs-by-spell spell)))
#_(langs-for-expr "neu")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TRANSLATED CONTENT:
(defn tc-lang?
"If the translated content is of language lang."
[tc lang]
(is-lang? (:ow.text/lang tc) lang))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; SYNTRANS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defn expr-if-syntr-is-lang
"Returns the expression of the syntrans it has the given language"
[syntr iso1]
(let [{:ow.expr/keys [lang] :as expr} (:ow.syntrans/expr syntr)]
(when (is-lang? lang iso1)
(t expr))))
(declare options dm-guess-pos)
(defn noun?
"Checks if the syntrans is a noun. If no options present for the syntrans
we fall back to the defined meaning.
Returns nil if we don't know (ie we have no part of speech info)"
[syntrans dm]
(let [PoS (or (:PoS (options syntrans false "en"))
(dm-guess-pos dm "en"))]
(when PoS
(= "noun" PoS))))
#_(let [dm (dm-by-spell-1 "intelligent")]
(noun? (first (syntr-for-dm dm "en")) dm))
(declare syntr-for-dm dm-by-spell-1)
(defn ex-syntr
"DEV HELPER: A single example syntrans."
[text iso1]
(first
(sort-by :ow.syntrans/id (syntr-for-dm (dm-by-spell-1 text) iso1))))
#_(ex-syntr "Blei" "de")
(defn syntr-for-dm
"Returns the syntranses that have the given DM.
Probably more interesting is the expr-for-dm function!"
([dm]
(mapv t (:ow.syntrans/_meaning dm)))
([dm iso1]
(into []
(comp
(filter #(expr-if-syntr-is-lang % iso1))
(map t))
(:ow.syntrans/_meaning dm))))
#_(syntr-for-dm (dm-by-spell-1 "hmm"))
#_(syntr-for-dm (dm-by-spell-1 "hmm") "de")
#_(syntr-for-dm (dm-by-spell-1 "dog") "de")
#_(syntr-for-dm (dm-by-spell-1 "encyclopedia") "en") ;; Def in both, en-US + en-GB
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; DEFINED MEANING:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defn dm-by-spell
"Returns all defined meanings for the exact spelling"
[spell]
;; The expr in similar language might refer to the SAME DM, so we make them distinct:
(into #{}
(comp
(mapcat syntr-for-expr)
(map (comp t :ow.syntrans/meaning)))
(exprs-by-spell spell)))
#_(dm-by-spell "Unfall")
(defn dm-by-spell-1
"DEV HELPER: Returns dm with the lowest id (thus hopefully the most common definition)"
[spell]
(first (sort-by :ow.meaning/id (dm-by-spell spell))))
(defn dm-text
"The TEXT of the defined meaning in the given language. Can be nil.
This is the \"long\" description sentence that explains the meaning.
Eg, on meaning of lead:
\"A small stick of graphite used in pencil that leaves marks when rubbed against a surface.\"
Defaults to english."
([dm] (dm-text dm "en"))
([dm lang]
(t (ffilter (fn [tc] (tc-lang? tc lang))
(:ow.tc/text (:ow.meaning/text dm))))))
#_(keep #(dm-text % "es") (dm-by-spell "Unfall"))
#_(keep dm-text (dm-by-spell "great"))
#_(keep dm-text (dm-by-spell "lead"))
(defn obj-coll-for-dm
"Returns the object collections that have the given DM"
[dm]
(:ow.coll.object/_meaning dm))
#_(mapv tc (obj-coll-for-dm (dm-by-spell-1 "Unfall")))
(defn exprs-for-dm
"Returns the syntrans's expressions that have the given DM.
This is more interesting since the syntr-for-dm gives no new info.
- If given a language will return only expressions in that language."
([dm]
(mapv (comp t :ow.syntrans/expr) (:ow.syntrans/_meaning dm)))
([dm iso1]
(keep #(expr-if-syntr-is-lang % iso1) (:ow.syntrans/_meaning dm))))
#_(exprs-for-dm (dm-by-spell-1 "hmm"))
#_(exprs-for-dm (dm-by-spell-1 "hmm") "de")
(defn expr-for-dm
"Returns a single expression for the given defined meaning in the given language.
The one with the lowest :ow/expr/id is returned!
(We hope that this is the most common one since it was added to the db early on)"
([dm iso1]
(first
(sort-by
:ow.expr/id
(exprs-for-dm dm iso1)))))
#_(expr-for-dm (dm-by-spell-1 "hmm") "de")
#_(expr-for-dm (dm-by-spell-1 "accident") "de")
;; Not really used:
(defn rel1-for-dm
[dm]
(:ow.relation/_meaning1 dm))
#_(rel1-for-dm (dm-by-spell-1 "Unfall"))
(defn rel2-for-dm
[dm]
(:ow.relation/_meaning2 dm))
#_(rel2-for-dm (dm-by-spell-1 "Unfall"))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; OBJECTS and OPTIONS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defn options-for-syntr
"Returns the options for the given syntrans."
[synt]
;; First we get the objects that refer to this syntrans:
(let [objects (mapv tc (:ow.attr.val/_object synt))]
;; Next we get the options that point to this object:
(mapv (comp tc :ow.attr.val/option) objects)))
#_(options-for-syntr (ex-syntr "Blei" "de"))
(defn option-meaning
"Gets the options value defined meaning.
(eg noun)"
[opt]
(-> opt :ow.option/meaning tc))
(comment
(mapv option-meaning (options-for-syntr (ex-syntr "Blei" "de"))))
(defn option-attr-dm
"Gets the options attribute's defined meaning
(eg \"part of speech\" or \"Genus\")"
[opt]
(-> opt :ow.option/attribute :ow.attr/attr tc))
#_(mapv option-attr-dm (options-for-syntr (ex-syntr "Blei" "de")))
(defn map-dm-to-keyword
"Some defined meanings (Such as part of speech or genus)
that are of interest can be mappped to keywords.
If nothing is found, returns nil."
[dm]
(case (long (:ow.meaning/id dm))
358760 :PoS
423236 :genus
nil))
(defn options
"Main Options API
Returns a map of options for the given syntrans:
expr -> expr
for the given syntrans and the iso1 code.
- if kw-keys is true the keys of the options are attempted to be translated to keywords
if not found the original meaning is used instead.
- genus option in english is:
o female
o neuter
o masculine
Similar for kw-vals."
[syntr & [expr? val-iso1]]
(let [val-iso1 (or val-iso1 "en")
options (options-for-syntr syntr)]
(zipmap
(mapv (comp (fn [dm] (or (map-dm-to-keyword dm) dm))
option-attr-dm)
options)
(mapv (comp (if expr?
#(expr-for-dm % val-iso1)
(fn [dm]
(:ow.expr/spelling (expr-for-dm dm val-iso1))))
:ow.option/meaning)
options))))
(comment
(let [word "sun", lang "de"]
(list
(options (ex-syntr word lang) false "de")
(options (ex-syntr word lang)))))
(defn dm-guess-pos
"So the part of speech is only set of syntrans, which kinda not makes sense.
The word for \"sun\" is really a noun in every language.
Though Omegawiki has other attributes on that level (syntrans level) since
they're different among languages. For instance, in some language \"sun\" is
neutrum, maskuline or doesn't even have a gender.
This function gets ALL the part of speech tags of all the syntrans's of the
given defined meaning and takes the most common one.
Returns the string (or nil) of the part of speech in language iso1."
[dm iso1]
(some->> (syntr-for-dm dm)
(into []
(comp (keep (comp not-empty #(options % false iso1)))
(map :PoS)))
frequencies
seq
(apply max-key second)
first))
#_(dm-guess-pos (dm-by-spell-1 "Sun") "en")
#_(dm-guess-pos (dm-by-spell-1 "pretty") "es")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment