Last active
September 20, 2022 17:34
-
-
Save jbouwman/ac8b9b9f92b7a0ea9c56e5983ed1ae8d to your computer and use it in GitHub Desktop.
List all Wikidata Properties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns user | |
(:require [clojure.java.io :as io] | |
[hickory.core :as hickory]) | |
(:import [java.net URL])) | |
(defn walk
  "Flattens a nested value into a sequence of `[path leaf]` pairs.

  Maps contribute their keys to the path; every other collection contributes
  the zero-based position of each element in iteration order. Anything that
  is not a collection is a leaf and yields the single pair `[path value]`.
  `path` is the optional path accumulated so far; when it is omitted and
  `value` itself is a leaf, the reported path is nil. Empty collections
  produce no pairs."
  [value & [path]]
  (let [base    (or path [])
        ;; Recurse into `child`, extending the current path by one step.
        descend (fn [step child]
                  (walk child (conj base step)))]
    (cond
      (map? value)
      (mapcat (fn [[k child]] (descend k child)) value)

      (coll? value)
      (->> value
           (map-indexed descend)
           (apply concat))

      :else
      [[path value]])))
(defn reader->walked
  "Slurps its arguments (anything `slurp` accepts, plus optional `slurp`
  options), parses the text with hickory, converts the parse to hiccup and
  flattens it with `walk` into `[path value]` pairs."
  [& slurp-args]
  (-> (apply slurp slurp-args)
      hickory/parse
      hickory/as-hiccup
      walk))
(defn prefix?
  "When `path` ends with `suffix`, returns the elements of `path` that come
  before that suffix (an empty sequence when the two are equal); otherwise
  returns nil. Despite the `?` in the name the result is the prefix itself,
  not a boolean."
  [path suffix]
  (let [cut         (- (count path) (count suffix))
        ;; A negative `cut` (path shorter than suffix) leaves `head` empty
        ;; and `tail` equal to the whole path, so the comparison fails.
        [head tail] (split-at cut path)]
    (when (= suffix tail)
      head)))
(defn find-records
  "Assembles records from walked data.

  `rule` maps each field key to a path suffix; `data` is a sequence of
  `[path value]` pairs. Every datum whose path ends with a field's suffix
  contributes that field's value to the record identified by the remaining
  path prefix. Returns a sequence of maps, one per prefix that received a
  value for every key in `rule`, in the order the prefixes first appear in
  `data`. When several data match the same field of the same record, the
  last one wins."
  [rule data]
  (let [wanted    (set (keys rule))
        ;; One [prefix key value] triple per (datum, rule entry) match.
        matches   (mapcat (fn [[path value]]
                            (reduce (fn [acc [k suffix]]
                                      (if-let [prefix (prefix? path suffix)]
                                        (conj acc [prefix k value])
                                        acc))
                                    nil
                                    rule))
                          data)
        by-prefix (group-by first matches)]
    (->> (map first matches)
         ;; `group-by` builds an unordered hash map once there are more than
         ;; a few groups, so walk the prefixes in first-seen order instead of
         ;; taking `vals`, keeping records in document order.
         distinct
         (map by-prefix)
         (map (fn [triples]
                (reduce (fn [m [_ k v]]
                          (assoc m k v))
                        {}
                        triples)))
         ;; Compare key *sets*: comparing the `keys` seqs directly is
         ;; order-sensitive and would drop complete records whenever the
         ;; rule's key order differs from the order fields occur in `data`.
         (filter (fn [record]
                   (= wanted (set (keys record))))))))
(defn scrape
  "Loads and flattens the document behind `reader` with `reader->walked`,
  then returns the records that `find-records` extracts from it for `rule`."
  [reader rule]
  (->> reader
       reader->walked
       (find-records rule)))
;; Example usage. Both forms are discarded by the reader (`#_`), so they are
;; never evaluated when the file is loaded; evaluate them by hand at a REPL.

;; Scrape the classpath resource "properties.html".
#_(scrape (io/resource "properties.html")
          {:id [3 2 2] :label [5 2]})

;; Scrape the live Wikidata property list over the network.
#_(scrape (URL. "https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/all")
          {:id [3 2 2] :label [5 2]})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment