Skip to content

Instantly share code, notes, and snippets.

@eerohele
Created February 14, 2023 13:04
Show Gist options
  • Save eerohele/f3f51606cd62df7e7940f5f7ed0eac56 to your computer and use it in GitHub Desktop.
Save eerohele/f3f51606cd62df7e7940f5f7ed0eac56 to your computer and use it in GitHub Desktop.
Clojure XPath
(set! *warn-on-reflection* true)
(require '[clojure.edn :as edn])
(require '[clojure.core.protocols :refer [Datafiable]])
(require '[clojure.datafy :as datafy])
(require '[clojure.java.io :as io])
(import '(java.io StringReader PushbackReader))
(import '(java.time LocalDateTime))
(import '(java.time.format DateTimeFormatter))
(import '(javax.xml.namespace QName))
(import '(javax.xml.parsers DocumentBuilder DocumentBuilderFactory))
(import '(javax.xml.xpath XPath XPathConstants XPathExpression XPathFactory))
(import '(org.xml.sax EntityResolver InputSource))
(import '(org.w3c.dom Document Element NamedNodeMap Node NodeList Text))
(defn ^:private dtd-ignoring-document-builder
  "Return a fresh javax.xml.parsers.DocumentBuilder, obtained from the default
  DocumentBuilderFactory, whose entity resolver answers every entity lookup
  (for instance, a DTD referenced from a DOCTYPE) with an empty input source."
  []
  (let [factory        (DocumentBuilderFactory/newDefaultInstance)
        builder        (.newDocumentBuilder factory)
        ;; Whatever the public/system identifiers are, hand back an empty
        ;; document so the parser has nothing to fetch.
        empty-resolver (reify EntityResolver
                         (resolveEntity [_ _public-id _system-id]
                           (InputSource. (StringReader. ""))))]
    (.setEntityResolver builder empty-resolver)
    builder))
;; XPathFactory instance created once, when this file is loaded.
(def ^{:private true :tag XPathFactory} xpath-factory
  (XPathFactory/newInstance))
;; Single XPath evaluator obtained from xpath-factory at load time; it is the
;; object used to compile every expression string.
;; NOTE(review): the JAXP documentation describes XPath objects as not
;; thread-safe -- confirm this is only used from one thread at a time.
(def ^{:private true :tag XPath} xpath
  (.newXPath xpath-factory))
;; Single DocumentBuilder, created at load time by
;; dtd-ignoring-document-builder and reused for every parse.
(def ^{:private true :tag DocumentBuilder} document-builder
  (dtd-ignoring-document-builder))
;; Memoized function from an XPath expression string to its compiled
;; XPathExpression. The :tag on the var is what type-hints the result of
;; calling (compile-xpath ...) at call sites. memoize keeps every distinct
;; expression it has seen.
(def ^{:private true :tag XPathExpression} compile-xpath
  (memoize
    (fn [expression]
      (.compile xpath expression))))
(def ^:private return-types
  "Lookup table from the return-type keywords accepted by `query` to the
  javax.xml.xpath.XPathConstants QName handed to the XPath evaluator."
  {:boolean XPathConstants/BOOLEAN
   :node    XPathConstants/NODE
   :nodeset XPathConstants/NODESET
   :number  XPathConstants/NUMBER
   :string  XPathConstants/STRING})
(defn ^:private node-seq
  "Wrap an org.w3c.dom.NodeList in an object that is both Seqable and
  Datafiable.

  Calling `seq` on the wrapper copies the nodes currently in the list into a
  vector and returns a seq over it (nil when the list is empty). Datafying
  the wrapper datafies each node in turn."
  [^NodeList nodelist]
  (reify
    clojure.lang.Seqable
    (seq [_]
      ;; mapv is eager, so the nodes are read out of the NodeList right here
      ;; rather than lazily at some later point.
      (let [size (.getLength nodelist)]
        (seq (mapv (fn [index] (.item nodelist (int index)))
                   (range size)))))

    Datafiable
    (datafy [this]
      (map datafy/datafy this))))
(defn query
  "Evaluate an XPath expression against a DOM node.

  Arguments:
    node        - the org.w3c.dom.Node used as the evaluation context.
    xpath-expr  - the XPath expression, as a string. Compiled expressions are
                  cached per expression string.
    return-type - optional keyword naming the result type: one of :boolean,
                  :node, :nodeset, :number or :string. Defaults to :nodeset.

  Returns the evaluation result. For :nodeset the resulting NodeList is
  wrapped so that it is seqable and datafiable; for every other return type
  the value produced by the XPath evaluator is returned unchanged.

  Throws IllegalArgumentException when return-type is not a supported
  keyword."
  ([node xpath-expr]
   (query node xpath-expr :nodeset))
  ([^Node node xpath-expr return-type]
   (let [^QName result-qname
         (or (get return-types return-type)
             ;; Without this check an unknown keyword would reach the
             ;; evaluator as a null QName and surface as an unexplained
             ;; NullPointerException.
             (throw (IllegalArgumentException.
                      (str "Unsupported XPath return type: " (pr-str return-type)
                           "; expected one of "
                           (pr-str (sort (keys return-types)))))))
         result (.evaluate ^XPathExpression (compile-xpath xpath-expr)
                           node
                           result-qname)]
     (if (= :nodeset return-type)
       (node-seq result)
       result))))
(defn parse-string
  "Parse the XML text in the string `s` with the shared document builder and
  return the resulting org.w3c.dom.Document."
  ^Document [s]
  ;; NOTE(review): every call goes through the one shared document-builder --
  ;; confirm callers never parse from several threads at once.
  (with-open [chars (StringReader. s)]
    (let [source (InputSource. chars)]
      (.parse document-builder source))))
;; REPL scratchpad (never evaluated when the file is loaded): parse a minimal
;; document and run the XPath expression "a" against it with the default
;; :nodeset return type.
(comment
(query (parse-string "<a/>") "a")
,,,)
;; Datafiable implementations for the DOM types handled here.
;;
;;   Document     -> the datafied document element
;;   Element      -> {:tag name, :attrs {name value}, :content [child ...]}
;;                   (:attrs and :content are omitted when empty)
;;   NamedNodeMap -> map of node name to node value
;;   Text         -> the character data of that node, as a string
;;
;; Child nodes of any other type are passed through clojure.datafy/datafy and
;; come back as whatever Datafiable yields for them.
(extend-protocol Datafiable
  Document
  (datafy [this]
    (datafy/datafy (.getDocumentElement this)))

  Element
  (datafy [this]
    (let [attrs    (not-empty (datafy/datafy (.getAttributes this)))
          children (node-seq (.getChildNodes this))]
      (cond-> {:tag (.getNodeName this)}
        attrs          (assoc :attrs attrs)
        (seq children) (assoc :content (mapv datafy/datafy children)))))

  NamedNodeMap
  (datafy [this]
    (into {}
          (for [i (range (.getLength this))
                ;; Fetch each node once and read both name and value from it.
                :let [attr (.item this i)]]
            [(.getNodeName attr) (.getNodeValue attr)])))

  Text
  (datafy [this]
    ;; Use this node's own data. getWholeText would return the concatenation
    ;; of every logically adjacent Text/CDATA sibling, so an element such as
    ;; <a>b<![CDATA[c]]>d</a> would list "bcd" once per child node.
    (.getData this)))
;; REPL scratchpad (never evaluated when the file is loaded): each form parses
;; a small XML string and datafies either the parsed document itself or the
;; result of an XPath query against it.
(comment
(datafy/datafy (query (parse-string "<a/>") "a"))
(datafy/datafy (query (parse-string "<a>b</a>") "a"))
(datafy/datafy (query (parse-string "<a><b>1</b><c>2</c></a>") "a"))
(datafy/datafy (parse-string "<a>b</a>"))
(datafy/datafy (parse-string "<a b=\"c\">d</a>"))
(datafy/datafy (query (parse-string "<a b=\"c\" d=\"e\">f</a>") "a"))
(datafy/datafy (query (parse-string "<a><b>c</b><b>d</b></a>") "a/b/text()"))
(datafy/datafy (query (parse-string "<a><b>c</b><b>d</b></a>") "a"))
,,,)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment