Skip to content

Instantly share code, notes, and snippets.

@jbouwman
Last active September 20, 2022 17:34
Show Gist options
  • Save jbouwman/ac8b9b9f92b7a0ea9c56e5983ed1ae8d to your computer and use it in GitHub Desktop.
Save jbouwman/ac8b9b9f92b7a0ea9c56e5983ed1ae8d to your computer and use it in GitHub Desktop.
List all Wikidata Properties
(ns user
(:require [clojure.java.io :as io]
[hickory.core :as hickory])
(:import [java.net URL]))
(defn walk
  "Flattens a nested value into a sequence of [path leaf] pairs.

  Maps extend the path with each entry's key; any other collection extends
  it with the element's position. Anything that is neither a map nor a
  collection is a leaf and is emitted together with the path that led to it.
  The optional `path` argument is the path accumulated so far; when it is
  omitted, a top-level leaf is reported with a nil path."
  [value & [path]]
  (let [base    (or path [])
        ;; Recurse into `child`, recording `step` as the next path segment.
        descend (fn [step child]
                  (walk child (conj base step)))]
    (cond
      (map? value)
      (mapcat (fn [[k child]] (descend k child)) value)

      (coll? value)
      (apply concat (map-indexed descend value))

      :else
      [[path value]])))
(defn reader->walked
  "Reads `source` with `slurp` (forwarding any extra `slurp-opts`), parses the
  text with `hickory/parse`, converts the result with `hickory/as-hiccup`, and
  flattens it into [path leaf] pairs with `walk`."
  [source & slurp-opts]
  (-> (apply slurp source slurp-opts)
      hickory/parse
      hickory/as-hiccup
      walk))
(defn prefix?
  "When `path` ends with `suffix`, returns the leading part of `path` that
  precedes the suffix (an empty, but truthy, sequence when the two are equal).
  Returns nil when `path` does not end with `suffix`."
  [path suffix]
  (let [cut         (- (count path) (count suffix))
        ;; split-at yields [(take cut path) (drop cut path)].
        [head tail] (split-at cut path)]
    (when (= suffix tail)
      head)))
(defn find-records
  "Assembles records out of walked data.

  `rule` is a map from field key to a path suffix. `data` is a sequence of
  [path value] pairs. Every pair whose path ends with one of the rule's
  suffixes contributes its value to the record identified by the remaining
  path prefix, under that rule's field key. If several values land on the
  same prefix and key, the last one seen wins.

  Returns a sequence of maps, one per prefix for which every key of `rule`
  was found. Incomplete records are dropped."
  [rule data]
  (let [required (set (keys rule))]
    (->> data
         ;; Emit one [prefix key value] triple per rule suffix that matches.
         (mapcat (fn [[path value]]
                   (reduce (fn [m [k suffix]]
                             (if-let [prefix (prefix? path suffix)]
                               (conj m [prefix k value])
                               m))
                           nil
                           rule)))
         (group-by first)
         vals
         (map (fn [record]
                (reduce (fn [m [_ k v]]
                          (assoc m k v))
                        {}
                        record)))
         ;; Compare key *sets*: a record's key order follows the order in
         ;; which fields were encountered in the data, which need not match
         ;; the order in which the rule map was written.
         (filter (fn [record]
                   (= (set (keys record)) required))))))
(defn scrape
  "Loads and flattens the document behind `reader` via `reader->walked`, then
  extracts the records described by `rule` with `find-records`."
  [reader rule]
  (->> reader
       reader->walked
       (find-records rule)))
;; Example invocations. Each form is preceded by the #_ reader macro, so the
;; reader discards it and nothing runs when the file is loaded.

;; Scrape a copy of the page available as the classpath resource
;; "properties.html".
#_
(scrape (io/resource "properties.html")
{:id [3 2 2] :label [5 2]})

;; Scrape the page directly from its URL, using the same rule.
#_
(scrape (URL. "https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/all")
{:id [3 2 2] :label [5 2]})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment