Skip to content

Instantly share code, notes, and snippets.

@orellabac
Created November 26, 2023 02:36
Show Gist options
  • Save orellabac/be8b9c54c3aa5f5c6fcfb915179cbfc5 to your computer and use it in GitHub Desktop.
Save orellabac/be8b9c54c3aa5f5c6fcfb915179cbfc5 to your computer and use it in GitHub Desktop.
Snowpark Java UTDF to apply an XPATH to an XML and return the matches as rows
/*
XPATHGET_TO_TABLE Function
Description:
The XPATHGET_TO_TABLE function is designed to extract and return XML nodes as a table based on the provided XPath expression. It takes as input an XML string, an XPath expression, and an object specifying namespace prefixes.
Parameters:
xml (String): The XML string from which the function extracts nodes.
xpath (String): The XPath expression used to navigate and locate nodes in the XML document.
prefixes (Object): An object specifying namespace prefixes for the XML document.
Return Value:
The function returns a table where each row represents a distinct node selected by the XPath expression. The column node contains the value of each selected node.
Example Usage:
sql
Copy code
SELECT * FROM TABLE(XPATHGET_TO_TABLE('<bookstore><book><title>Introduction to SQL</title></book><book><title>Advanced SQL</title></book></bookstore>', '//book/title', object_construct()));
-- Output:
--<title>Introduction to SQL</title>
--<title>Advanced SQL</title>
Note:
Ensure that the XML string is well-formed, and the XPath expression accurately reflects the structure of the XML document for proper extraction. The prefixes parameter allows handling namespaces in the XPath expression.
*/
create or replace function XPATHGET_TO_TABLE(xml String, xpath String, prefixes object)
returns table(node varchar)
language java
handler='XmlExtractor'
target_path='@MYSTAGE/XPATHGET_TO_TABLE.jar'
as
$$
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import java.util.Map;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
class OutputRow {
public String node;
public OutputRow(String node) { this.node = node;}
}
class CustomNamespaceContext implements NamespaceContext {
private final Map<String, String> namespaceMap;
public CustomNamespaceContext(Map<String, String> namespaceMap) { this.namespaceMap = namespaceMap; }
@Override
public String getNamespaceURI(String prefix)
{ return (namespaceMap!=null)?namespaceMap.get(prefix):null; }
@Override
public String getPrefix(String namespaceURI) { return null; }
@Override
public Iterator<String> getPrefixes(String namespaceURI) { return null; }
}
class XmlProcessor
{
static TransformerFactory TRANSFORMERFACTORY = TransformerFactory.newInstance();
static Transformer TRANSFORMER = null;
static {
try {
TRANSFORMER = TRANSFORMERFACTORY.newTransformer();
TRANSFORMER.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
}
catch (Exception ex) {}
}
public static String elementToString(Node element) {
try {
DOMSource source = new DOMSource(element);
StreamResult result = new StreamResult(new StringWriter());
TRANSFORMER.transform(source, result);
return result.getWriter().toString();
} catch (Exception e)
{ return null; }
}
private static class NodeListIterator implements Iterator<OutputRow> {
private final NodeList nodeList;
private int currentIndex;
NodeListIterator(NodeList nodeList) { this.nodeList = nodeList;this.currentIndex = 0;}
@Override
public boolean hasNext() { return currentIndex < nodeList.getLength(); }
@Override
public OutputRow next()
{
var xml = elementToString(nodeList.item(currentIndex++));
return new OutputRow(xml);
}
}
public static NodeList extractTags(Map<String, String> namespaceMap, String xmlString, String xpathExpression) throws Exception {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
Document doc = builder.parse(new ByteArrayInputStream(xmlString.getBytes()));
//Get XPath expression
XPathFactory xpathfactory = XPathFactory.newInstance();
XPath xpath = xpathfactory.newXPath();
xpath.setNamespaceContext(new CustomNamespaceContext(namespaceMap));
XPathExpression expr = xpath.compile(xpathExpression);
//Search XPath expression
Object result = expr.evaluate(doc, XPathConstants.NODESET);
//Iterate over results and fetch book names
NodeList nodes = (NodeList) result;
return nodes;
}
public static Stream<OutputRow> processXmlData(Map<String, String> prefixToNamespaceMap, String xmlString, String xpathExpression) {
try {
NodeList elements = extractTags(prefixToNamespaceMap, xmlString, xpathExpression);
Iterator<OutputRow> elementIterator = new NodeListIterator(elements);
return StreamSupport.stream(
Spliterators.spliteratorUnknownSize(elementIterator, Spliterator.ORDERED),
false
);
} catch (Exception e)
{ return Stream.empty();}
}
}
public class XmlExtractor {
public static Class getOutputClass() { return OutputRow.class; }
public Stream<OutputRow> process(String xml, String xpath,Map<String,String> prefixes)
{ return XmlProcessor.processXmlData(prefixes, xml, xpath); }
public Stream<OutputRow> endPartition() { return Stream.empty(); }
}
$$;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment