Safe XPath Queries in Swift
/*
 * XPathQuery.swift
 * test_lbp
 *
 * Originally created by Matt Gallagher on 4/08/08.
 * Heavily modified (conversion to Swift, better libxml2 nodes to object
 * conversion, etc.) by François Lamboley.
 *
 * Created by François Lamboley on 16/04/2017.
 * Copyright © 2017 François Lamboley. All rights reserved.
 */
import Foundation | |
/* Note: To be able to "import libxml2" just like that, create a
 * module.modulemap file containing the following:
 *    module libxml2 [system] {
 *       header "/usr/include/libxml2/libxml/tree.h"
 *       header "/usr/include/libxml2/libxml/parser.h"
 *       header "/usr/include/libxml2/libxml/HTMLparser.h"
 *       header "/usr/include/libxml2/libxml/xpath.h"
 *       header "/usr/include/libxml2/libxml/xpathInternals.h"
 *       export *
 *    }
 */
import libxml2 | |
struct XPathQueryUtils { | |
/** Every failure the XPath query utilities can throw. */
enum Error : Swift.Error {
	/* Document parsing and XPath evaluation. */
	/** The XML/HTML reader returned no document. */
	case cannotCreateXMLDoc
	/** No XPath context could be created for the document. */
	case cannotCreateXPathContext
	/** The query string could not be converted to a UTF-8 C string. */
	case cannotConvertQueryToCString
	/** Evaluating the XPath expression returned no result object. */
	case cannotEvaluateExpression
	/** The XPath result object has no node set. */
	case nilNodeSet

	/* Conversion of libxml2 nodes to LibXML2Node values. */
	case textNodeWithNoContent, cdataNodeWithNoContent
	case elementNodeWithNoName
	/** An entry of an element's attribute list did not convert to an
	    attribute. */
	case invalidAttributeNodeInElement
	case attributeNodeWithNoName, attributeNodeWithNoContent
	/** The value child of an attribute node did not convert to a text node. */
	case invalidContentNodeInElement

	/* String decoding. */
	/** Some bytes expected to be UTF-8 text were not valid UTF-8. */
	case invalidUTF8
}
/** Swift representation of a libxml2 node. */
enum LibXML2Node {
	/** An element, with its attributes and its converted children. */
	case element(name: String, attributes: [LibXML2AttributeNode], children: [LibXML2Node])

	/** A standalone attribute. */
	case attribute(LibXML2AttributeNode)

	/** A text node. libxml2 is written in a way which makes even text nodes
	    susceptible to have children nodes. We assume they won't. */
	case text(String)

	/** A CData section, kept as raw bytes. libxml2 is written in a way which
	    makes even CData nodes susceptible to have children nodes. We assume
	    they won't. */
	case cdata(Data)

	/** Any other node type. We only support conversion for the types above;
	    more could be added if needed. Also, we assume other types don't have
	    attributes (they currently don't at least). */
	case other(type: xmlElementType, name: String?, value: Data?, children: [LibXML2Node])
}
/** A name/value pair describing one attribute of an element. */
struct LibXML2AttributeNode {
	/* Declaration order matters: it defines the memberwise initializer
	 * init(name:value:). */
	var name, value: String
}
/** Concatenates the textual content of the given nodes, in order.

 Only `.text` and `.cdata` nodes contribute to the result; every other kind
 of node is skipped.

 - Parameter nodeList: The nodes to extract the text from.
 - Returns: The concatenation of all text and CData contents.
 - Throws: `Error.invalidUTF8` if the bytes of a CData node are not valid
   UTF-8. */
static func textFrom(nodeList: [LibXML2Node]) throws -> String {
	var fragments = [String]()
	for node in nodeList {
		if case .text(let text) = node {
			fragments.append(text)
		} else if case .cdata(let bytes) = node {
			guard let decoded = String(data: bytes, encoding: .utf8) else {throw Error.invalidUTF8}
			fragments.append(decoded)
		}
		/* Any other kind of node is deliberately ignored. */
	}
	return fragments.joined()
}
/** Builds a name → value dictionary from a list of attributes.

 If several attributes share the same name, the value of the last one in the
 list is kept.

 - Parameter attributeNodesList: The attributes to index by name.
 - Returns: A dictionary mapping each attribute name to its value. */
static func dictionaryFrom(attributeNodesList: [LibXML2AttributeNode]) -> [String: String] {
	var valuesByName: [String: String] = [:]
	attributeNodesList.forEach{ attribute in
		valuesByName[attribute.name] = attribute.value
	}
	return valuesByName
}
/** Parses the given data as XML and runs an XPath query on the result.

 The document is parsed with the `XML_PARSE_RECOVER` option and is freed
 before returning.

 - Parameter query: The XPath expression to evaluate.
 - Parameter doc: The raw bytes of the XML document.
 - Returns: The Swift representation of the nodes matched by the query.
 - Throws: `Error.cannotCreateXMLDoc` if the data is too big to be given to
   libxml2 or if libxml2 cannot create a document from it; any error thrown
   while evaluating the query or converting the resulting nodes. */
static func performXMLXPathQuery(_ query: String, withDocument doc: Data) throws -> [LibXML2Node] {
	/* libxml2 takes the size of the buffer as a C int. A plain Int32(...)
	 * conversion would trap on data bigger than Int32.max bytes; we report an
	 * error instead. */
	guard let size = Int32(exactly: doc.count) else {throw Error.cannotCreateXMLDoc}
	return try doc.withUnsafeBytes { (bytes: UnsafePointer<Int8>) -> [LibXML2Node] in
		guard let xmlDoc = xmlReadMemory(bytes, size, "", nil, Int32(XML_PARSE_RECOVER.rawValue)) else {throw Error.cannotCreateXMLDoc}
		defer {xmlFreeDoc(xmlDoc)}
		return try performXPathQuery(query, withDocument: xmlDoc)
	}
}
/** Parses the given data as HTML and runs an XPath query on the result.

 The document is parsed with the `HTML_PARSE_NOWARNING` and
 `HTML_PARSE_NOERROR` options and is freed before returning.

 - Parameter query: The XPath expression to evaluate.
 - Parameter doc: The raw bytes of the HTML document.
 - Returns: The Swift representation of the nodes matched by the query.
 - Throws: `Error.cannotCreateXMLDoc` if the data is too big to be given to
   libxml2 or if libxml2 cannot create a document from it; any error thrown
   while evaluating the query or converting the resulting nodes. */
static func performHTMLXPathQuery(_ query: String, withDocument doc: Data) throws -> [LibXML2Node] {
	/* libxml2 takes the size of the buffer as a C int. A plain Int32(...)
	 * conversion would trap on data bigger than Int32.max bytes; we report an
	 * error instead. */
	guard let size = Int32(exactly: doc.count) else {throw Error.cannotCreateXMLDoc}
	let options = Int32(HTML_PARSE_NOWARNING.rawValue | HTML_PARSE_NOERROR.rawValue)
	return try doc.withUnsafeBytes { (bytes: UnsafePointer<Int8>) -> [LibXML2Node] in
		guard let htmlDoc = htmlReadMemory(bytes, size, "", nil, options) else {throw Error.cannotCreateXMLDoc}
		defer {xmlFreeDoc(htmlDoc)}
		return try performXPathQuery(query, withDocument: htmlDoc)
	}
}
/* ***************
   MARK: - Private
   *************** */
/** Copies the bytes of a NUL-terminated xmlChar string into a Data.

 - Parameter ptr: A pointer to a NUL-terminated string of xmlChar.
 - Returns: The bytes of the string, without the terminating NUL. */
private static func dataFromXmlCharPtr(_ ptr: UnsafePointer<xmlChar>) -> Data {
	/* ptr is a pointer to xmlChar (aka. UInt8). We go through a raw pointer
	 * so that the very same memory can be seen as Int8 by strlen.
	 *
	 * Note: We can use strlen because 0x00 is an invalid byte in an XML doc. */
	let bytes = UnsafeRawPointer(ptr)
	let length = Int(strlen(bytes.assumingMemoryBound(to: Int8.self)))
	return Data(bytes: bytes, count: length)
}
/** Decodes a NUL-terminated xmlChar string as UTF-8.

 - Parameter ptr: A pointer to a NUL-terminated string of xmlChar.
 - Returns: The decoded string.
 - Throws: `Error.invalidUTF8` if the bytes cannot be decoded as UTF-8. */
private static func stringFromXmlCharPtr(_ ptr: UnsafePointer<xmlChar>) throws -> String {
	/* ptr is a pointer to xmlChar (aka. UInt8), but String(cString:encoding:)
	 * wants a pointer to CChar. We view the same memory through a raw
	 * pointer to get one. */
	let cString = UnsafeRawPointer(ptr).assumingMemoryBound(to: CChar.self)
	if let decoded = String(cString: cString, encoding: .utf8) {
		return decoded
	}
	throw Error.invalidUTF8
}
/** Converts a libxml2 node, and recursively its attributes and children, to
 its Swift representation.

 - Parameter node: The libxml2 node to convert.
 - Returns: `.element`, `.attribute`, `.text` or `.cdata` for the matching
   libxml2 node types, and `.other` for every other type.
 - Throws: One of the `Error` cases if a mandatory piece of information
   (name, content) is missing, if a node has an unexpected type where an
   attribute or an attribute value was expected, or if a name or a text
   content is not valid UTF-8. */
private static func swiftNode(fromXMLNode node: xmlNodePtr) throws -> LibXML2Node {
	/* See https://www.w3.org/TR/REC-DOM-Level-1/level-one-core.html#ID-1841493061 for a list of node types and their values. */
	switch node.pointee.type {
	case XML_ELEMENT_NODE:
		/* Element name. Mandatory. */
		let namePtr = node.pointee.name
		guard let name = try namePtr.flatMap({ try stringFromXmlCharPtr($0) }) else {throw Error.elementNodeWithNoName}

		/* An element node should not have a content. (This is why it is not processed here.) */

		/* Attributes. Each entry of the properties list is reinterpreted as a
		 * generic node so it can go through this very function; the
		 * conversion must yield an attribute. */
		var attributes = [LibXML2AttributeNode]()
		var curAttribute = node.pointee.properties
		while let attribute = curAttribute {
			defer {curAttribute = attribute.pointee.next}
			switch try swiftNode(fromXMLNode: UnsafeMutablePointer<_xmlNode>(OpaquePointer(attribute))) {
			case .attribute(let swiftAttribute): attributes.append(swiftAttribute)
			default: throw Error.invalidAttributeNodeInElement
			}
		}

		/* Children nodes. */
		let children = try swiftNodes(fromSiblingsStartingAt: node.pointee.children)

		return .element(name: name, attributes: attributes, children: children)

	case XML_ATTRIBUTE_NODE:
		/* Attribute name. Mandatory. */
		let namePtr = node.pointee.name
		guard let name = try namePtr.flatMap({ try stringFromXmlCharPtr($0) }) else {throw Error.attributeNodeWithNoName}

		/* Attribute value. Mandatory. In the children attribute, as a text node, for whatever reason...
		 * Only the first child is considered. */
		guard let valueNode = node.pointee.children else {throw Error.attributeNodeWithNoContent}
		switch try swiftNode(fromXMLNode: valueNode) {
		case .text(let text): return .attribute(LibXML2AttributeNode(name: name, value: text))
		default: throw Error.invalidContentNodeInElement
		}

	case XML_TEXT_NODE:
		/* node.pointee.name == "text" */
		guard let textContent = node.pointee.content else {throw Error.textNodeWithNoContent}
		return .text(try stringFromXmlCharPtr(textContent))

	case XML_CDATA_SECTION_NODE:
		/* node.pointee.name == "cdata-section" */
		guard let dataContent = node.pointee.content else {throw Error.cdataNodeWithNoContent}
		return .cdata(dataFromXmlCharPtr(dataContent))

	default:
		/* Name and content are both optional for the other node types. */
		let namePtr = node.pointee.name
		let name = try namePtr.flatMap{ try stringFromXmlCharPtr($0) }

		/* The value comes from the content field of the node (NOT from its
		 * name). */
		let contentPtr = node.pointee.content
		let content = contentPtr.flatMap{ dataFromXmlCharPtr($0) }

		let children = try swiftNodes(fromSiblingsStartingAt: node.pointee.children)

		return .other(type: node.pointee.type, name: name, value: content, children: children)
	}
}

/** Converts the given node and all the siblings reachable through its `next`
 pointers, in order. Returns an empty array if `first` is nil. */
private static func swiftNodes(fromSiblingsStartingAt first: xmlNodePtr?) throws -> [LibXML2Node] {
	var converted = [LibXML2Node]()
	var current = first
	while let sibling = current {
		converted.append(try swiftNode(fromXMLNode: sibling))
		current = sibling.pointee.next
	}
	return converted
}
/** Evaluates an XPath expression on an already parsed document.

 The XPath context and the result object created for the evaluation are
 freed before returning. The document itself is not freed here.

 - Parameter query: The XPath expression to evaluate.
 - Parameter document: The libxml2 document on which to run the query.
 - Returns: The Swift representation of the nodes of the resulting node set.
   Nil entries in the node set are skipped.
 - Throws: `Error.cannotCreateXPathContext`,
   `Error.cannotConvertQueryToCString`, `Error.cannotEvaluateExpression` or
   `Error.nilNodeSet` if the matching step fails; any error thrown while
   converting the resulting nodes. */
private static func performXPathQuery(_ query: String, withDocument document: xmlDocPtr) throws -> [LibXML2Node] {
	/* Create XPath evaluation context */
	guard let xpathCtx = xmlXPathNewContext(document) else {throw Error.cannotCreateXPathContext}
	defer {xmlXPathFreeContext(xpathCtx)}

	/* Evaluate XPath expression.
	 * The C string is an array of CChar (signed); libxml2 wants xmlChar
	 * (unsigned). The bytes must be reinterpreted, not value-converted:
	 * xmlChar(_:) traps on negative values, i.e. on every UTF-8 byte of a
	 * non-ASCII character. */
	guard let cQuery = query.cString(using: .utf8) else {throw Error.cannotConvertQueryToCString}
	let xmlCharQuery = cQuery.map{ xmlChar(bitPattern: $0) }
	guard let xpathObj = xmlXPathEvalExpression(xmlCharQuery, xpathCtx) else {throw Error.cannotEvaluateExpression}
	defer {xmlXPathFreeObject(xpathObj)}

	/* Note: Is it an error to have a nil Node Set, or can this happen when
	 * the set is empty? */
	guard let nodes = xpathObj.pointee.nodesetval else {throw Error.nilNodeSet}

	var resultNodes = [LibXML2Node]()
	for i in 0..<Int(nodes.pointee.nodeNr) {
		guard let nodePtr = nodes.pointee.nodeTab.advanced(by: i).pointee else {continue}
		resultNodes.append(try swiftNode(fromXMLNode: nodePtr))
	}
	return resultNodes
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment