Last active
June 20, 2019 14:44
-
-
Save thomaswilburn/0be95ed78de2fe98a3bac74347715743 to your computer and use it in GitHub Desktop.
A terrible HTML parser that you might be able to use to scrape (well-constructed) pages from Google Apps Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Minimal DOM-like tree node.
 *
 * @param {string} [type] Node kind. "text" nodes carry their content in
 *   `textContent` and are serialized as bare text; every other value is
 *   treated as an element-like node.
 */
var Node = function(type) {
  this.type = type;
  this.tagName = null;
  this.attributes = {};
  this.children = [];
  this.parentElement = null;
  this.textContent = "";
};

Node.prototype = {
  /**
   * Serialized markup of all children, concatenated (each child is
   * serialized at depth 0, so the string starts with a newline).
   */
  get innerHTML() {
    return this.children.map(function(c) {
      return c.serialize();
    }).join("");
  },

  /**
   * Collects every descendant element matching a comma-separated list of
   * simple selectors, in document (pre-order) order. Each node appears at
   * most once, even when it matches several sub-selectors.
   *
   * Supported per sub-selector: an optional leading tag name (or `*`),
   * `.class` (matched case-insensitively), `#id`, and attribute tests
   * `[name]`, `[name=value]`, `[name~=v]`, `[name|=v]`, `[name^=v]`,
   * `[name$=v]`, `[name*=v]` with optional quotes around the value.
   * Descendant/child combinators are not supported, and a comma inside an
   * attribute value is not supported because the list is split on ",".
   *
   * @param {string} selector
   * @returns {Node[]} matching descendants (never includes `this`)
   */
  querySelectorAll: function(selector) {
    var hasOwn = Object.prototype.hasOwnProperty;
    var attrPattern = /^\[\s*([^\s=~|^$*\]]+)\s*(?:([~|^$*]?=)\s*(?:"([^"]*)"|'([^']*)'|([^\s\]]*))\s*)?\]$/;

    // Evaluates one parsed attribute test against a node.
    var matchesAttribute = function(node, test) {
      if (!hasOwn.call(node.attributes, test.name)) return false;
      if (!test.operator) return true; // presence-only: [name]
      var actual = String(node.attributes[test.name]);
      var expected = test.value;
      switch (test.operator) {
        case "=":
          return actual == expected;
        case "~=":
          return expected != "" && actual.split(/\s+/).indexOf(expected) > -1;
        case "|=":
          return actual == expected || actual.indexOf(expected + "-") == 0;
        case "^=":
          return expected != "" && actual.indexOf(expected) == 0;
        case "$=":
          return expected != "" && actual.slice(-expected.length) == expected;
        case "*=":
          return expected != "" && actual.indexOf(expected) > -1;
      }
      return false;
    };

    // Turns one simple selector into a predicate returning true/false.
    var compile = function(source) {
      var attrTests = [];
      var invalid = false;
      // Pull out the [..] parts first so that "." and "#" inside attribute
      // values (e.g. [href="a.html#top"]) are not mistaken for classes/ids.
      var rest = source.replace(/\[[^\]]+?\]/g, function(chunk) {
        var m = chunk.match(attrPattern);
        if (m) {
          var value = m[3] != null ? m[3] : (m[4] != null ? m[4] : m[5]);
          attrTests.push({ name: m[1], operator: m[2], value: value });
        } else {
          invalid = true; // unparseable attribute test: match nothing
        }
        return "";
      });

      // Digits and dashes are allowed after the first letter so that
      // h1..h6 and custom-element names work.
      var tagMatch = rest.match(/^[a-z][a-z0-9\-]*/i);
      var tag = tagMatch ? tagMatch[0].toLowerCase() : null;
      var classes = (rest.match(/\.[\w\-]+/g) || []).map(function(c) {
        return c.slice(1).toLowerCase();
      });
      var idMatch = rest.match(/#[\w\-]+/);
      var id = idMatch ? idMatch[0].slice(1) : null;

      return function(node) {
        // Only element-like nodes can match; text nodes never do.
        if (invalid || node.type == "text") return false;
        if (tag && String(node.tagName).toLowerCase() != tag) return false;
        if (classes.length) {
          if (!node.attributes.class) return false;
          var present = String(node.attributes.class).toLowerCase().split(/\s+/);
          for (var i = 0; i < classes.length; i++) {
            if (present.indexOf(classes[i]) == -1) return false;
          }
        }
        if (id !== null && node.attributes.id != id) return false;
        for (var j = 0; j < attrTests.length; j++) {
          if (!matchesAttribute(node, attrTests[j])) return false;
        }
        return true;
      };
    };

    var patterns = selector.split(",")
      .map(function(s) { return s.trim(); })
      .filter(function(s) { return s.length > 0; }) // ignore empty sub-selectors
      .map(compile);

    var results = [];
    var walk = function(node) {
      node.children.forEach(function(n) {
        // `some` stops at the first matching sub-selector, so a node is
        // pushed only once.
        if (patterns.some(function(p) { return p(n); })) results.push(n);
        walk(n);
      });
    };
    walk(this);
    return results;
  },

  /**
   * Appends `element` to this node's children and sets its parent link.
   * @param {Node} element
   */
  appendChild: function(element) {
    this.children.push(element);
    element.parentElement = this;
  },

  /**
   * Renders this node and its subtree as indented markup. Every node starts
   * on a new line, so the result begins with "\n".
   *
   * @param {number} [depth=0] number of indent spaces for this node
   * @returns {string}
   */
  serialize: function(depth) {
    depth = depth || 0;
    var indent = "\n" + new Array(depth + 1).join(" ");
    if (this.type == "text") return indent + this.textContent;

    // A node without a tag name (the parser's document root) has no markup
    // of its own; emit only its children instead of "<null>...</null>".
    if (this.tagName == null) {
      return this.children.map(function(c) {
        return c.serialize(depth);
      }).join("");
    }

    var self = this;
    var attributes = Object.keys(this.attributes).map(function(n) {
      // Escape double quotes so the value cannot terminate the attribute.
      return n + '="' + String(self.attributes[n]).replace(/"/g, "&quot;") + '"';
    }).join(" ");

    var output = indent + ("<" + this.tagName + " " + attributes).trim() + ">";
    if (this.textContent) output += indent + " " + this.textContent;
    output += this.children.map(function(c) {
      return c.serialize(depth + 2);
    }).join("");
    // Only break before the closing tag when the element had content.
    output += (this.textContent || this.children.length ? indent : "") + "</" + this.tagName + ">";
    return output;
  }
};
// Void elements: they never have children or a closing tag, so the parser
// must not descend into them.
var selfClosing = "meta link img area base br col embed hr input param source track wbr".split(" ");

// Elements whose body is raw text rather than markup.
var nullContents = "script".split(" ");

// Implicit end tags: maps an open element to the START tags that close it
// when no explicit closing tag was written (e.g. a new <li> closes the
// previous <li>). Container tags such as <ul> and <table> are deliberately
// not listed as closers: a nested list or table is a legitimate child of
// an <li>/<td>, and the container's own closing tag already unwinds any
// still-open items.
var closedBy = {
  li: ["li"],
  dt: ["dt", "dd"],
  dd: ["dt", "dd"],
  option: ["option", "optgroup"],
  th: ["th", "td", "tr", "tbody", "thead", "tfoot"],
  td: ["th", "td", "tr", "tbody", "thead", "tfoot"],
  tr: ["tr", "tbody", "thead", "tfoot"],
  tbody: ["tbody", "thead", "tfoot"],
  thead: ["tbody", "thead", "tfoot"],
  tfoot: ["tbody", "thead", "tfoot"]
};
/**
 * Small, forgiving HTML parser that builds a tree of `Node` objects.
 * It holds no state; a single instance can parse any number of documents.
 */
var HTMLParser = function() {
};

HTMLParser.prototype = {
  /**
   * Parses an HTML string into a tree.
   *
   * Behaviour, in summary:
   *  - comments and `<!...>` declarations (doctype) are removed up front;
   *  - tag names are lower-cased, so `<DIV>` is closed by `</div>`;
   *  - attributes may be unquoted, double-quoted or single-quoted; a value
   *    is split from its name at the FIRST "=", so values may contain "=";
   *  - a tag written as `<x ... />` is treated as having no children, as is
   *    any tag listed in `selfClosing`;
   *  - for tags in `nullContents` everything up to the matching closing tag
   *    is stored verbatim in `textContent` (to end of input if unclosed);
   *  - `closedBy` drives implicit closing of elements such as `<li>`;
   *  - whitespace-only text is dropped and other text is trimmed.
   *
   * @param {string} src HTML source
   * @returns {Node} root node (no tag name) whose children are the
   *   top-level nodes of the document
   * @throws {Error} when a "<" is never followed by an unquoted ">"
   */
  parse: function(src) {
    var hasOwn = Object.prototype.hasOwnProperty;

    // Strip whole comments first: they may legally contain ">" which the
    // generic "<!...>" pattern would stop at too early.
    src = src.replace(/<!--[\s\S]*?-->/g, "").replace(/<![^>]*>/g, "").trim();

    // Reads the tag starting at src[start] == "<". Returns the
    // whitespace-separated pieces between "<" and the closing ">" (quotes
    // protect whitespace and ">") plus the index of that ">".
    var readTag = function(start) {
      var quote = null;
      var buffer = "";
      var chunks = [];
      var i = start + 1;
      for (; i < src.length; i++) {
        var ch = src.charAt(i);
        if (quote) {
          if (ch == quote) quote = null;
          buffer += ch;
        } else if (ch == ">") {
          break;
        } else if (ch == '"' || ch == "'") {
          quote = ch;
          buffer += ch;
        } else if (/\s/.test(ch)) {
          // Runs of whitespace must not produce empty pieces.
          if (buffer) chunks.push(buffer);
          buffer = "";
        } else {
          buffer += ch;
        }
      }
      if (i >= src.length) {
        throw new Error("Ran out of document, starting from position " + start);
      }
      if (buffer) chunks.push(buffer);
      return { chunks: chunks, end: i };
    };

    // Stores one `name`, `name=value` or `name="value"` piece on `target`.
    var readAttribute = function(chunk, target) {
      var eq = chunk.indexOf("=");
      var name = eq == -1 ? chunk : chunk.slice(0, eq);
      if (!name || name == "/") return;
      var value = eq == -1 ? "" : chunk.slice(eq + 1);
      var first = value.charAt(0);
      if (value.length > 1 && (first == '"' || first == "'") &&
          value.charAt(value.length - 1) == first) {
        value = value.slice(1, -1);
      } else {
        // Unbalanced input: keep the historical lenient double-quote strip.
        value = value.replace(/^"|"$/g, "");
      }
      target[name] = value.trim();
    };

    var root = new Node();
    var branch = root;
    var pos = 0;

    while (pos < src.length) {
      // ---- text up to the next tag (or to the end of the input) ----
      if (src.charAt(pos) != "<") {
        var textEnd = src.indexOf("<", pos);
        if (textEnd == -1) textEnd = src.length;
        var contents = src.slice(pos, textEnd).trim();
        if (contents.length) {
          var text = new Node("text");
          text.textContent = contents;
          branch.appendChild(text);
        }
        pos = textEnd;
        continue;
      }

      // ---- a tag ----
      var tag = readTag(pos);
      var chunks = tag.chunks;
      var isClosing = src.charAt(pos + 1) == "/";
      pos = tag.end + 1;
      if (!chunks.length) continue; // "<>" carries no information

      if (isClosing) {
        // Walk up to the nearest open element with this name; a closing tag
        // that matches nothing is ignored.
        var closingName = chunks[0].slice(1).toLowerCase();
        var opening = branch;
        while (opening && opening.tagName != closingName) {
          opening = opening.parentElement;
        }
        if (opening) {
          branch = opening.parentElement;
        }
        continue;
      }

      // Detect and remove an XML-style trailing "/" ("<br/>", "<img a="b"/>").
      // A slash that ends an unquoted value (href=/path/) is left alone.
      var selfClosed = false;
      var lastIndex = chunks.length - 1;
      var last = chunks[lastIndex];
      if (last == "/") {
        chunks.pop();
        selfClosed = true;
      } else if (last.length > 1 && last.charAt(last.length - 1) == "/" &&
                 (lastIndex == 0 || /["']\/$/.test(last))) {
        chunks[lastIndex] = last.slice(0, -1);
        selfClosed = true;
      }
      if (!chunks.length) continue;

      var child = new Node("html");
      child.tagName = chunks.shift().toLowerCase();
      chunks.forEach(function(chunk) {
        readAttribute(chunk, child.attributes);
      });

      // is this a tag that is closed implicitly by siblings? Walk up the tree if so.
      while (hasOwn.call(closedBy, branch.tagName) &&
             closedBy[branch.tagName].indexOf(child.tagName) > -1) {
        branch = branch.parentElement;
      }
      branch.appendChild(child);

      // handle tags that do not contain HTML (scripts)
      if (nullContents.indexOf(child.tagName) > -1) {
        var escapedName = child.tagName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        var closer = new RegExp("</" + escapedName + "\\s*>", "gi");
        closer.lastIndex = pos;
        var found = closer.exec(src);
        if (found) {
          child.textContent = src.slice(pos, found.index);
          // Resume exactly after the closing tag so the next "<" is seen.
          pos = found.index + found[0].length;
        } else {
          // Unterminated raw-text element: it swallows the rest of the input.
          child.textContent = src.slice(pos);
          pos = src.length;
        }
      } else if (!selfClosed && selfClosing.indexOf(child.tagName) == -1) {
        branch = child;
      }
    }
    return root;
  }
};
// Manual demo / smoke test. It runs only when this file is executed directly
// with Node.js (`node thisfile.js`): the `require.main === module` check
// keeps it from firing when the file is merely require()'d by another module,
// and the `typeof` guards keep it inert in environments without `process`
// or `require`.
if (typeof process != "undefined" && process.env &&
    typeof require != "undefined" && typeof module != "undefined" &&
    require.main === module) {
  var fs = require("fs");
  var html = fs.readFileSync("./ratings.html", "utf-8");
  var parser = new HTMLParser();
  var tree = parser.parse(html);
  console.log(tree.serialize());

  var modals = tree.querySelectorAll(".solid-seats-modal");
  // One [state, district, name, lean, rating] row per table row, flattened
  // across all modals.
  var ratings = [].concat.apply([], modals.map(function(m) {
    var title = m.querySelectorAll(".solid-seats-modal-in-title")[0];
    // Tolerate a modal without a title instead of throwing.
    var rating = title ? title.innerHTML.trim() : "";
    // Drop the first row of each table (presumably the header row).
    var rows = m.querySelectorAll(".popup-table-data-row").slice(1);
    return rows.map(function(row) {
      var cells = row.querySelectorAll(".popup-table-data-cell")
        .map(function(c) { return c.innerHTML.trim(); });
      var [race, name, lean] = cells;
      // The first cell is split on "-" into state and district; an empty
      // row yields undefined fields rather than a crash.
      var [state, district] = (race || "").split("-");
      return [ state, district, name, lean, rating ];
    });
  }));
  // `ratings` is only printed when the next line is enabled:
  // console.log("state,district,name,lean,rating\n" + ratings.map(function(r) { return r.join() }).join("\n"));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment