Created
November 26, 2023 02:36
-
-
Save orellabac/be8b9c54c3aa5f5c6fcfb915179cbfc5 to your computer and use it in GitHub Desktop.
Snowpark Java UTDF to apply an XPATH to an XML and return the matches as rows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
XPATHGET_TO_TABLE Function | |
Description: | |
The XPATHGET_TO_TABLE function is designed to extract and return XML nodes as a table based on the provided XPath expression. It takes as input an XML string, an XPath expression, and an object specifying namespace prefixes. | |
Parameters: | |
xml (String): The XML string from which the function extracts nodes. | |
xpath (String): The XPath expression used to navigate and locate nodes in the XML document. | |
prefixes (Object): An object specifying namespace prefixes for the XML document. | |
Return Value: | |
The function returns a table where each row represents a distinct node selected by the XPath expression. The column node contains the value of each selected node. | |
Example Usage: | |
sql | |
Copy code | |
SELECT * FROM TABLE(XPATHGET_TO_TABLE('<bookstore><book><title>Introduction to SQL</title></book><book><title>Advanced SQL</title></book></bookstore>', '//book/title', object_construct())); | |
-- Output: | |
--<title>Introduction to SQL</title> | |
--<title>Advanced SQL</title> | |
Note: | |
Ensure that the XML string is well-formed, and the XPath expression accurately reflects the structure of the XML document for proper extraction. The prefixes parameter allows handling namespaces in the XPath expression. | |
*/ | |
create or replace function XPATHGET_TO_TABLE(xml String, xpath String, prefixes object) | |
returns table(node varchar) | |
language java | |
handler='XmlExtractor' | |
target_path='@MYSTAGE/XPATHGET_TO_TABLE.jar' | |
as | |
$$ | |
import java.io.ByteArrayInputStream; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.StringWriter; | |
import java.util.ArrayList; | |
import java.util.Iterator; | |
import javax.xml.XMLConstants; | |
import javax.xml.namespace.NamespaceContext; | |
import javax.xml.parsers.DocumentBuilder; | |
import javax.xml.parsers.DocumentBuilderFactory; | |
import javax.xml.transform.OutputKeys; | |
import javax.xml.transform.Transformer; | |
import javax.xml.transform.TransformerFactory; | |
import javax.xml.transform.dom.DOMSource; | |
import javax.xml.transform.stream.StreamResult; | |
import javax.xml.xpath.XPath; | |
import javax.xml.xpath.XPathConstants; | |
import javax.xml.xpath.XPathExpression; | |
import javax.xml.xpath.XPathFactory; | |
import org.w3c.dom.Document; | |
import org.w3c.dom.NodeList; | |
import org.w3c.dom.Element; | |
import org.w3c.dom.Node; | |
import java.util.Map; | |
import java.util.Spliterator; | |
import java.util.Spliterators; | |
import java.util.stream.Stream; | |
import java.util.stream.StreamSupport; | |
class OutputRow { | |
public String node; | |
public OutputRow(String node) { this.node = node;} | |
} | |
class CustomNamespaceContext implements NamespaceContext { | |
private final Map<String, String> namespaceMap; | |
public CustomNamespaceContext(Map<String, String> namespaceMap) { this.namespaceMap = namespaceMap; } | |
@Override | |
public String getNamespaceURI(String prefix) | |
{ return (namespaceMap!=null)?namespaceMap.get(prefix):null; } | |
@Override | |
public String getPrefix(String namespaceURI) { return null; } | |
@Override | |
public Iterator<String> getPrefixes(String namespaceURI) { return null; } | |
} | |
class XmlProcessor | |
{ | |
static TransformerFactory TRANSFORMERFACTORY = TransformerFactory.newInstance(); | |
static Transformer TRANSFORMER = null; | |
static { | |
try { | |
TRANSFORMER = TRANSFORMERFACTORY.newTransformer(); | |
TRANSFORMER.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); | |
} | |
catch (Exception ex) {} | |
} | |
public static String elementToString(Node element) { | |
try { | |
DOMSource source = new DOMSource(element); | |
StreamResult result = new StreamResult(new StringWriter()); | |
TRANSFORMER.transform(source, result); | |
return result.getWriter().toString(); | |
} catch (Exception e) | |
{ return null; } | |
} | |
private static class NodeListIterator implements Iterator<OutputRow> { | |
private final NodeList nodeList; | |
private int currentIndex; | |
NodeListIterator(NodeList nodeList) { this.nodeList = nodeList;this.currentIndex = 0;} | |
@Override | |
public boolean hasNext() { return currentIndex < nodeList.getLength(); } | |
@Override | |
public OutputRow next() | |
{ | |
var xml = elementToString(nodeList.item(currentIndex++)); | |
return new OutputRow(xml); | |
} | |
} | |
public static NodeList extractTags(Map<String, String> namespaceMap, String xmlString, String xpathExpression) throws Exception { | |
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
factory.setNamespaceAware(true); | |
DocumentBuilder builder = factory.newDocumentBuilder(); | |
Document doc = builder.parse(new ByteArrayInputStream(xmlString.getBytes())); | |
//Get XPath expression | |
XPathFactory xpathfactory = XPathFactory.newInstance(); | |
XPath xpath = xpathfactory.newXPath(); | |
xpath.setNamespaceContext(new CustomNamespaceContext(namespaceMap)); | |
XPathExpression expr = xpath.compile(xpathExpression); | |
//Search XPath expression | |
Object result = expr.evaluate(doc, XPathConstants.NODESET); | |
//Iterate over results and fetch book names | |
NodeList nodes = (NodeList) result; | |
return nodes; | |
} | |
public static Stream<OutputRow> processXmlData(Map<String, String> prefixToNamespaceMap, String xmlString, String xpathExpression) { | |
try { | |
NodeList elements = extractTags(prefixToNamespaceMap, xmlString, xpathExpression); | |
Iterator<OutputRow> elementIterator = new NodeListIterator(elements); | |
return StreamSupport.stream( | |
Spliterators.spliteratorUnknownSize(elementIterator, Spliterator.ORDERED), | |
false | |
); | |
} catch (Exception e) | |
{ return Stream.empty();} | |
} | |
} | |
public class XmlExtractor { | |
public static Class getOutputClass() { return OutputRow.class; } | |
public Stream<OutputRow> process(String xml, String xpath,Map<String,String> prefixes) | |
{ return XmlProcessor.processXmlData(prefixes, xml, xpath); } | |
public Stream<OutputRow> endPartition() { return Stream.empty(); } | |
} | |
$$; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment