Skip to content

Instantly share code, notes, and snippets.

@KenDUemura
Last active October 19, 2020 03:39
Show Gist options
  • Save KenDUemura/983766f3f276b984f40e10a311eb72dc to your computer and use it in GitHub Desktop.
Save KenDUemura/983766f3f276b984f40e10a311eb72dc to your computer and use it in GitHub Desktop.
Google Apps Script that parses a table element from an HTML string using XmlService and returns its contents as a 2D array
/*
Parses an XML (XHTML) string with XmlService, walks a simple XPath down to a
table element and returns the text of its cells as a 2D array.

PARAMS:
text XML (In this gist, namespace needs to be 'http://www.w3.org/1999/xhtml')
path XPATH (So far tested with simple indexed lookup /html/body/table[3]).
     Only plain child steps with an optional 1-based numeric index are
     handled; "/html/" is dropped because html is the root element.
RETURNS:
Array[][] (Table data): one inner array per tr, one entry per td, taken from
     tbody when the table has one, otherwise from the table's direct tr
     children. An empty array is returned (and a message is logged) when the
     last path step is not a table.
THROWS:
Error when a path step does not resolve to an element that is then needed,
     or when a bracketed step has no numeric index.
Missing th/thead/tfoot support
*/
function getDataFromXpath (text, path) {
  var xmlDoc = XmlService.parse(text);
  Logger.log("INFO: xmlDoc \n" + xmlDoc);
  // html will be the RootElement, so it is not looked up as a child step.
  var tags = path.replace("/html/", "").split("/");
  Logger.log("tags : " + tags);
  var namespace = XmlService.getNamespace('http://www.w3.org/1999/xhtml');
  var element = xmlDoc.getRootElement();
  for (var i = 0; i < tags.length; i++) {
    // A previous step that found nothing would otherwise surface as an
    // opaque TypeError on the next dereference; name the failing step instead.
    if (!element) {
      throw new Error("Element not found for path step: " + tags[i - 1]);
    }
    var tag = tags[i];
    Logger.log("Tag : " + tag);
    var bracket = tag.indexOf("[");
    if (bracket !== -1) {
      var indexMatch = tag.match(/\[(\d+)\]/);
      if (!indexMatch) {
        throw new Error("Unsupported path step (numeric index expected): " + tag);
      }
      // XPath positions are 1-based, arrays are 0-based.
      var position = parseInt(indexMatch[1], 10);
      element = element.getChildren(tag.substring(0, bracket), namespace)[position - 1];
    } else {
      element = element.getChild(tag, namespace);
    }
    Logger.log(element);
  }

  var data = [];
  var lastTag = tags[tags.length - 1];
  // Compare the bare tag name so that e.g. "datatable" is not taken for a table.
  var lastName = lastTag.replace(/\[.*$/, "");
  if (lastName !== "table") {
    Logger.log("Unsupported tag type: " + lastTag);
    return data;
  }
  if (!element) {
    throw new Error("Element not found for path step: " + lastTag);
  }

  Logger.log("Parsing Table");
  // TODO: thead
  // tbody: rows live under tbody when present, otherwise directly under table.
  var tbody = element.getChild("tbody", namespace);
  var rowParent = tbody ? tbody : element;
  var rows = rowParent.getChildren("tr", namespace);
  for (var r = 0; r < rows.length; r++) {
    var cols = rows[r].getChildren("td", namespace);
    var row = [];
    for (var c = 0; c < cols.length; c++) {
      row.push(cols[c].getValue());
    }
    // The original built each row but never stored it, so the result was always empty.
    data.push(row);
  }
  // TODO: tfoot
  return data;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment