Created
May 28, 2017 01:35
-
-
Save Frizlab/54c17bbb9921a82fb854697dbf48719a to your computer and use it in GitHub Desktop.
Safe XPath Queries in Swift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * XPathQuery.swift
 * test_lbp
 *
 * Originally created by Matt Gallagher on 4/08/08.
 * Heavily modified (conversion to swift, better libxml2 nodes to object
 * conversion, etc.) by François Lamboley.
 *
 * Created by François Lamboley on 16/04/2017.
 * Copyright © 2017 François Lamboley. All rights reserved.
 */
import Foundation | |
/* Note: To be able to "import libxml2" just like that, create a
 * module.modulemap file containing the following:
 *    module libxml2 [system] {
 *       header "/usr/include/libxml2/libxml/tree.h"
 *       header "/usr/include/libxml2/libxml/parser.h"
 *       header "/usr/include/libxml2/libxml/HTMLparser.h"
 *       header "/usr/include/libxml2/libxml/xpath.h"
 *       header "/usr/include/libxml2/libxml/xpathInternals.h"
 *       export *
 *    }
 */
import libxml2 | |
/** Helpers to run an XPath query on an XML or HTML document with libxml2 and
 get the matched nodes as Swift values instead of raw libxml2 pointers. */
struct XPathQueryUtils {
	
	/** The errors thrown by the functions of `XPathQueryUtils`. */
	enum Error : Swift.Error {
		case cannotCreateXMLDoc
		case cannotCreateXPathContext
		case cannotConvertQueryToCString
		case cannotEvaluateExpression
		case nilNodeSet
		case textNodeWithNoContent
		case cdataNodeWithNoContent
		case elementNodeWithNoName
		case invalidAttributeNodeInElement
		case attributeNodeWithNoName
		case attributeNodeWithNoContent
		case invalidContentNodeInElement
		case invalidUTF8
	}
	
	/** The Swift representation of a libxml2 node. */
	enum LibXML2Node {
		case element(name: String, attributes: [LibXML2AttributeNode], children: [LibXML2Node])
		case attribute(LibXML2AttributeNode)
		case text(String) /* libxml2 is written in a way which makes even text nodes susceptible to have children nodes. We assume they won't. */
		case cdata(Data)  /* libxml2 is written in a way which makes even CData nodes susceptible to have children nodes. We assume they won't. */
		case other(type: xmlElementType, name: String?, value: Data?, children: [LibXML2Node]) /* We only support conversion for above types. We could add more if needed. Also, we assume other types don't have attributes (they currently don't at least). */
	}
	
	/** The name and value of an attribute. */
	struct LibXML2AttributeNode {
		var name: String
		var value: String
	}
	
	/** Takes an array of LibXML2Nodes and outputs a string. Only .text and
	 .cdata nodes are considered; all the other nodes are skipped.
	 
	 - Throws: `Error.invalidUTF8` if the content of a .cdata node is not valid
	 UTF-8. */
	static func textFrom(nodeList: [LibXML2Node]) throws -> String {
		var res = String()
		for n in nodeList {
			switch n {
			case .text(let str):
				res += str
				
			case .cdata(let data):
				guard let str = String(data: data, encoding: .utf8) else {throw Error.invalidUTF8}
				res += str
				
			case .element, .attribute, .other:
				/* Ignored on purpose. */
				break
			}
		}
		return res
	}
	
	/** Converts a list of attributes to a name -> value dictionary. If a name
	 appears more than once, the last value wins. */
	static func dictionaryFrom(attributeNodesList: [LibXML2AttributeNode]) -> [String: String] {
		var res = [String: String]()
		for a in attributeNodesList {res[a.name] = a.value}
		return res
	}
	
	/** Parses the given data as an XML document (in recover mode) and returns
	 the nodes matching the given XPath query.
	 
	 - Throws: `Error.cannotCreateXMLDoc` if the document cannot be parsed or is
	 too big to be given to libxml2, or any error thrown when evaluating the
	 query or converting the resulting nodes. */
	static func performXMLXPathQuery(_ query: String, withDocument doc: Data) throws -> [LibXML2Node] {
		/* libxml2 takes the size of the buffer as a C int. We fail gracefully
		 * instead of trapping if the data does not fit. */
		guard let size = Int32(exactly: doc.count) else {throw Error.cannotCreateXMLDoc}
		return try doc.withUnsafeBytes { (bytes: UnsafePointer<Int8>) -> [LibXML2Node] in
			guard let xmlDoc = xmlReadMemory(bytes, size, "", nil, Int32(XML_PARSE_RECOVER.rawValue)) else {throw Error.cannotCreateXMLDoc}
			defer {xmlFreeDoc(xmlDoc)}
			
			return try performXPathQuery(query, withDocument: xmlDoc)
		}
	}
	
	/** Parses the given data as an HTML document (parser warnings and errors
	 are silenced) and returns the nodes matching the given XPath query.
	 
	 - Throws: `Error.cannotCreateXMLDoc` if the document cannot be parsed or is
	 too big to be given to libxml2, or any error thrown when evaluating the
	 query or converting the resulting nodes. */
	static func performHTMLXPathQuery(_ query: String, withDocument doc: Data) throws -> [LibXML2Node] {
		/* libxml2 takes the size of the buffer as a C int. We fail gracefully
		 * instead of trapping if the data does not fit. */
		guard let size = Int32(exactly: doc.count) else {throw Error.cannotCreateXMLDoc}
		return try doc.withUnsafeBytes { (bytes: UnsafePointer<Int8>) -> [LibXML2Node] in
			guard let htmlDoc = htmlReadMemory(bytes, size, "", nil, Int32(HTML_PARSE_NOWARNING.rawValue | HTML_PARSE_NOERROR.rawValue)) else {throw Error.cannotCreateXMLDoc}
			defer {xmlFreeDoc(htmlDoc)}
			
			return try performXPathQuery(query, withDocument: htmlDoc)
		}
	}
	
	/* ***************
	   MARK: - Private
	   *************** */
	
	/** Copies the bytes of a NULL-terminated xmlChar string (terminator
	 excluded) in a Data. */
	private static func dataFromXmlCharPtr(_ ptr: UnsafePointer<xmlChar>) -> Data {
		/* ptr is a pointer to xmlChar (aka. UInt8). We go through a raw pointer
		 * to be able to see it as a pointer to Int8 for strlen.
		 *
		 * Note: We can use strlen because 0x00 is an invalid byte in an XML doc. */
		let rawPtr = UnsafeRawPointer(ptr)
		return Data(bytes: rawPtr, count: Int(strlen(rawPtr.assumingMemoryBound(to: Int8.self))))
	}
	
	/** Converts a NULL-terminated xmlChar string to a String.
	 
	 - Throws: `Error.invalidUTF8` if the bytes are not valid UTF-8. */
	private static func stringFromXmlCharPtr(_ ptr: UnsafePointer<xmlChar>) throws -> String {
		/* ptr is a pointer to xmlChar (aka. UInt8).
		 * We convert it to an opaque pointer to retrieve an unsafe pointer to
		 * CChar (aka. Int8). */
		guard let ret = String(cString: UnsafePointer<CChar>(OpaquePointer(ptr)), encoding: .utf8) else {throw Error.invalidUTF8}
		return ret
	}
	
	/** Converts all the nodes in the `children` linked list of the given node. */
	private static func swiftChildren(ofXMLNode node: xmlNodePtr) throws -> [LibXML2Node] {
		var children = [LibXML2Node]()
		var curChild = node.pointee.children
		while let child = curChild {
			children.append(try swiftNode(fromXMLNode: child))
			curChild = child.pointee.next
		}
		return children
	}
	
	/** Recursively converts a libxml2 node to its Swift representation. */
	private static func swiftNode(fromXMLNode node: xmlNodePtr) throws -> LibXML2Node {
		/* See https://www.w3.org/TR/REC-DOM-Level-1/level-one-core.html#ID-1841493061 for a list of node types and their values. */
		let type = node.pointee.type
		switch type {
		case XML_ELEMENT_NODE:
			/* Element name. Mandatory. */
			let namePtr = node.pointee.name
			guard let name = try namePtr.flatMap({ try stringFromXmlCharPtr($0) }) else {throw Error.elementNodeWithNoName}
			
			/* An element node should not have a content. (This is why it is not processed here.) */
			
			/* Attributes. */
			var attributes = [LibXML2AttributeNode]()
			var curAttribute = node.pointee.properties
			while let attribute = curAttribute {
				defer {curAttribute = attribute.pointee.next}
				/* The attribute pointer is converted through the same function as
				 * any other node; we only accept an attribute as a result. */
				switch try swiftNode(fromXMLNode: UnsafeMutablePointer<_xmlNode>(OpaquePointer(attribute))) {
				case .attribute(let swiftAttribute): attributes.append(swiftAttribute)
				default: throw Error.invalidAttributeNodeInElement
				}
			}
			
			/* Children nodes. */
			let children = try swiftChildren(ofXMLNode: node)
			
			return .element(name: name, attributes: attributes, children: children)
			
		case XML_ATTRIBUTE_NODE:
			/* Attribute name. Mandatory. */
			let namePtr = node.pointee.name
			guard let name = try namePtr.flatMap({ try stringFromXmlCharPtr($0) }) else {throw Error.attributeNodeWithNoName}
			
			/* Attribute value. Mandatory. In the children attribute, as a text node, for whatever reason... */
			guard let valueNode = node.pointee.children else {throw Error.attributeNodeWithNoContent}
			switch try swiftNode(fromXMLNode: valueNode) {
			case .text(let text): return .attribute(LibXML2AttributeNode(name: name, value: text))
			default: throw Error.invalidContentNodeInElement
			}
			
		case XML_TEXT_NODE:
			/* node.pointee.name == "text" */
			guard let textContent = node.pointee.content else {throw Error.textNodeWithNoContent}
			return .text(try stringFromXmlCharPtr(textContent))
			
		case XML_CDATA_SECTION_NODE:
			/* node.pointee.name == "cdata-section" */
			guard let dataContent = node.pointee.content else {throw Error.cdataNodeWithNoContent}
			return .cdata(dataFromXmlCharPtr(dataContent))
			
		case XML_NAMESPACE_DECL:
			/* A namespace node in an XPath result is an xmlNs, not an xmlNode:
			 * it must not be read through the xmlNode fields (apart from the
			 * type). We report the prefix (nil for the default namespace) as the
			 * name and the namespace URI as the value. */
			let ns = UnsafeMutablePointer<_xmlNs>(OpaquePointer(node))
			let prefixPtr = ns.pointee.prefix
			let prefix = try prefixPtr.flatMap{ try stringFromXmlCharPtr($0) }
			let hrefPtr = ns.pointee.href
			let href = hrefPtr.flatMap{ dataFromXmlCharPtr($0) }
			return .other(type: type, name: prefix, value: href, children: [])
			
		default:
			let namePtr = node.pointee.name
			let name = try namePtr.flatMap{ try stringFromXmlCharPtr($0) }
			
			/* Only the node types below are real xmlNode structures. The other
			 * ones (documents, DTDs, declarations) share the first fields of
			 * xmlNode (name and children included), but not the content field,
			 * which must thus not be read for them. */
			let content: Data?
			switch type {
			case XML_ENTITY_REF_NODE, XML_PI_NODE, XML_COMMENT_NODE, XML_DOCUMENT_FRAG_NODE, XML_XINCLUDE_START, XML_XINCLUDE_END:
				let contentPtr = node.pointee.content
				content = contentPtr.flatMap{ dataFromXmlCharPtr($0) }
			default:
				content = nil
			}
			
			let children = try swiftChildren(ofXMLNode: node)
			
			return .other(type: type, name: name, value: content, children: children)
		}
	}
	
	/** Evaluates the given XPath query on an already parsed document and
	 converts the resulting node set. The document is not freed.
	 
	 - Returns: The matched nodes; an empty array if the query matched nothing.
	 - Throws: `Error.nilNodeSet` if the result of the query is not a node set
	 (and has thus no nodes to return), and the other `Error` cases when the
	 evaluation or the conversion of the nodes fails. */
	private static func performXPathQuery(_ query: String, withDocument document: xmlDocPtr) throws -> [LibXML2Node] {
		/* Create XPath evaluation context */
		guard let xpathCtx = xmlXPathNewContext(document) else {throw Error.cannotCreateXPathContext}
		defer {xmlXPathFreeContext(xpathCtx)}
		
		/* Evaluate XPath expression. The C string is made of CChar (Int8); we
		 * reinterpret the bits because the bytes of non-ASCII characters are
		 * negative and a value-preserving conversion to xmlChar would trap. */
		guard let xmlCharQuery = query.cString(using: .utf8)?.map({ xmlChar(bitPattern: $0) }) else {throw Error.cannotConvertQueryToCString}
		guard let xpathObj = xmlXPathEvalExpression(xmlCharQuery, xpathCtx) else {throw Error.cannotEvaluateExpression}
		defer {xmlXPathFreeObject(xpathObj)}
		
		guard let nodes = xpathObj.pointee.nodesetval else {
			/* libxml2 itself treats a nil node set as an empty one (see
			 * xmlXPathNodeSetIsEmpty). It is an error only if the result is not
			 * a node set at all (number, string, boolean, etc.). */
			guard xpathObj.pointee.type == XPATH_NODESET else {throw Error.nilNodeSet}
			return []
		}
		
		let nodesCount = Int(nodes.pointee.nodeNr)
		guard nodesCount > 0, let nodeTab = nodes.pointee.nodeTab else {return []}
		
		var resultNodes = [LibXML2Node]()
		for i in 0..<nodesCount {
			guard let nodePtr = nodeTab[i] else {continue}
			resultNodes.append(try swiftNode(fromXMLNode: nodePtr))
		}
		
		return resultNodes
	}
	
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment