;; Created February 14, 2023 13:04
;; Clojure XPath
;; Warn at compile time wherever Java interop would fall back to reflection.
(set! *warn-on-reflection* true)

;; Clojure namespaces.
(require '[clojure.edn :as edn])
(require '[clojure.core.protocols :refer [Datafiable]])
(require '[clojure.datafy :as datafy])
;; NOTE(review): the namespace name was missing here; clojure.java.io is the
;; conventional target of the `io` alias -- confirm.
(require '[clojure.java.io :as io])

;; Java classes.
;; StringReader and PushbackReader both live in java.io (package name was missing).
(import '(java.io StringReader PushbackReader))
(import '(java.time LocalDateTime))
(import '(java.time.format DateTimeFormatter))
(import '(javax.xml.namespace QName))
(import '(javax.xml.parsers DocumentBuilder DocumentBuilderFactory))
(import '(javax.xml.xpath XPath XPathConstants XPathExpression XPathFactory))
(import '(org.xml.sax EntityResolver InputSource))
(import '(org.w3c.dom Document Element NamedNodeMap Node NodeList Text))
(defn ^:private dtd-ignoring-document-builder
  "Return a new javax.xml.parsers.DocumentBuilder whose EntityResolver answers
  every entity lookup with an empty input source, so a DTD referenced by the
  document being parsed is replaced by empty content instead of being loaded."
  []
  (doto (.newDocumentBuilder (DocumentBuilderFactory/newDefaultInstance))
    (.setEntityResolver
      (reify EntityResolver
        ;; Public ID and system ID are ignored: every entity resolves to "".
        (resolveEntity [_ _ _]
          (InputSource. (StringReader. "")))))))
;; Factory from which the XPath evaluator is created.
;; NOTE(review): the init expression was missing from this def;
;; newDefaultInstance mirrors the DocumentBuilderFactory/newDefaultInstance
;; call used for the document builder -- confirm against the original.
(def ^XPathFactory ^:private xpath-factory
  (XPathFactory/newDefaultInstance))
;; The XPath evaluator obtained from xpath-factory; compile-xpath uses it to
;; compile expression strings.
(def ^:private ^XPath xpath
  (let [factory xpath-factory]
    (.newXPath factory)))
;; The DocumentBuilder used by parse-string. It is created once, via
;; dtd-ignoring-document-builder, so parsing does not load referenced DTDs.
(def ^DocumentBuilder ^:private document-builder
  (dtd-ignoring-document-builder))
;; Memoized compiler: XPath expression string -> compiled XPathExpression.
;; Each distinct expression string is compiled once and then served from the
;; memoize cache. The ^XPathExpression tag describes what calling this var
;; returns, so call sites can use the result in interop without reflection.
(def ^:private ^XPathExpression compile-xpath
  (memoize
    (fn [expression]
      (.compile xpath expression))))
;; Lookup table from the return-type keywords accepted by `query` to the
;; corresponding javax.xml.xpath.XPathConstants QName.
(def ^:private return-types
  {:boolean XPathConstants/BOOLEAN
   :node    XPathConstants/NODE
   :nodeset XPathConstants/NODESET
   :number  XPathConstants/NUMBER
   :string  XPathConstants/STRING})
(defn ^:private node-seq
  "Given an org.w3c.dom.NodeList, return a seq on the nodes in the node list.

  The returned object is Seqable, so `seq`, `map`, `mapv` and friends walk the
  nodes in list order (`seq` yields nil for an empty list). It is also
  Datafiable: datafying it yields the datafied form of every node."
  [^NodeList nodelist]
  (reify
    clojure.lang.Seqable
    (seq [_]
      (let [len (.getLength nodelist)]
        (loop [i 0 nodes []]
          (if (= i len)
            ;; Seqable/seq has to return an ISeq (or nil), not the vector itself.
            (seq nodes)
            (recur (inc i) (conj nodes (.item nodelist i)))))))

    Datafiable
    (datafy [this]
      (map datafy/datafy this))))
(defn query
  "Evaluate the XPath expression `xpath-expr` (a string) against `node` (an
  org.w3c.dom.Node) and return the result.

  `return-type` is one of the keys of `return-types` (:boolean, :node,
  :nodeset, :number, :string) and defaults to :nodeset. A :nodeset result is
  wrapped with `node-seq`; every other result is returned as evaluated."
  ([node xpath-expr]
   (query node xpath-expr :nodeset))
  ([^Node node xpath-expr return-type]
   (let [^QName result-type (return-types return-type)
         result (.evaluate (compile-xpath xpath-expr) node result-type)]
     (if (= :nodeset return-type)
       (node-seq result)
       result))))
(defn parse-string
  "Parse the XML in string `s` with the shared `document-builder` and return
  the resulting org.w3c.dom.Document."
  ^Document [s]
  (with-open [source (StringReader. s)]
    (let [input (InputSource. source)]
      (.parse document-builder input))))
;; REPL example: run the XPath expression "a" against a minimal document,
;; using the default :nodeset return type.
(query (parse-string "<a/>") "a")
;; Teach clojure.datafy how to turn DOM objects into plain Clojure data.
(extend-protocol Datafiable
  ;; A document datafies to whatever its root element datafies to.
  Document
  (datafy [this]
    (datafy/datafy (.getDocumentElement this)))

  ;; An element becomes {:tag <node name>}, plus :attrs when it has attributes
  ;; and :content (a vector of datafied child nodes) when it has children.
  Element
  (datafy [this]
    (let [attrs (not-empty (datafy/datafy (.getAttributes this)))
          content (node-seq (.getChildNodes this))]
      (cond-> {:tag (.getNodeName this)}
        attrs (assoc :attrs attrs)
        (seq content) (assoc :content (mapv datafy/datafy content)))))

  ;; An attribute collection becomes a map of node name -> node value.
  NamedNodeMap
  (datafy [this]
    (into {}
          (for [i (range (.getLength this))
                ;; Look the attribute node up once and read both fields from it.
                :let [attr (.item this i)]]
            [(.getNodeName attr) (.getNodeValue attr)])))

  ;; A text node becomes its whole text.
  Text
  (datafy [this]
    (.getWholeText this)))
;; REPL examples: datafy query results and whole documents.

;; Query result for an empty element.
(datafy/datafy (query (parse-string "<a/>") "a"))
;; Query result for an element with a single text child.
(datafy/datafy (query (parse-string "<a>b</a>") "a"))
;; Query result for an element with two differently named child elements.
(datafy/datafy (query (parse-string "<a><b>1</b><c>2</c></a>") "a"))
;; A whole document (no query) whose root has a text child.
(datafy/datafy (parse-string "<a>b</a>"))
;; A whole document whose root has one attribute and a text child.
(datafy/datafy (parse-string "<a b=\"c\">d</a>"))
;; Query result for an element with two attributes and a text child.
(datafy/datafy (query (parse-string "<a b=\"c\" d=\"e\">f</a>") "a"))
;; Query selecting the text nodes of both <b> children.
(datafy/datafy (query (parse-string "<a><b>c</b><b>d</b></a>") "a/b/text()"))
;; Query result for an element with two same-named child elements.
(datafy/datafy (query (parse-string "<a><b>c</b><b>d</b></a>") "a"))
