Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A terrible HTML parser that you might be able to use to scrape (well-constructed) pages from Google Apps Script
/**
 * A node in the lightweight document tree built by the parser.
 *
 * @param {string} [type] - "html" for element nodes, "text" for text nodes.
 *   It is left undefined when a node is created with `new Node()`.
 */
var Node = function(type) {
  this.type = type;
  this.tagName = null;
  this.attributes = {};
  this.children = [];
  this.parentElement = null;
  this.textContent = "";
};

Node.prototype = {
  /**
   * Markup of every child node, each serialized at depth 0 and concatenated.
   * The node's own `textContent` is not included.
   */
  get innerHTML() {
    return this.children.map(function(c) {
      return c.serialize();
    }).join("");
  },

  /**
   * Finds all descendants matching a comma-separated list of simple selectors.
   *
   * Each simple selector may combine a tag name, `.class` tokens, one `#id`
   * and `[attr]` / `[attr=value]` tests (also `~=`, `|=`, `^=`, `$=`, `*=`).
   * Combinators (descendant, child, sibling) are not supported.
   *
   * @param {string} selector - Selector list, e.g. `"h1, div.note[data-x='1']"`.
   * @returns {Node[]} Matching descendants in document order, each at most once.
   */
  querySelectorAll: function(selector) {
    var results = [];
    var hasOwn = Object.prototype.hasOwnProperty;
    // name, optional operator, optional (possibly quoted) value
    var attrPattern = /^\s*([^\s~|^$*=]+)\s*(?:([~|^$*]?=)\s*([\s\S]*?))?\s*$/;

    var attributeMatches = function(node, test) {
      if (!hasOwn.call(node.attributes, test.name)) return false;
      if (test.op === null) return true;
      var actual = String(node.attributes[test.name]);
      var wanted = test.value;
      switch (test.op) {
        case "=": return actual === wanted;
        case "~=": return wanted !== "" && actual.split(/\s+/).indexOf(wanted) > -1;
        case "|=": return actual === wanted || actual.indexOf(wanted + "-") === 0;
        case "^=": return wanted !== "" && actual.indexOf(wanted) === 0;
        case "$=": return wanted !== "" && actual.slice(-wanted.length) === wanted;
        case "*=": return wanted !== "" && actual.indexOf(wanted) > -1;
        default: return false;
      }
    };

    // Turns one simple selector into a predicate returning true/false.
    var compile = function(s) {
      var attrTests = [];
      // Pull attribute selectors out first so that "." and "#" inside their
      // values (e.g. [href="a.html#top"]) are not read as class/id tokens.
      var bare = s.replace(/\[([^\]]+?)\]/g, function(whole, inner) {
        var m = attrPattern.exec(inner);
        if (m) {
          attrTests.push({
            name: m[1],
            op: m[2] || null,
            value: m[3] === undefined ? null : m[3].replace(/^(["'])([\s\S]*)\1$/, "$2")
          });
        }
        return "";
      });

      // Digits and dashes are allowed after the first letter so "h1" is not cut to "h".
      var tagMatch = bare.match(/^[a-z][a-z0-9\-]*/i);
      var tagName = tagMatch ? tagMatch[0].toLowerCase() : null;
      var classNames = (bare.match(/\.[\w\-]+/g) || []).map(function(c) {
        return c.slice(1).toLowerCase();
      });
      var idMatch = bare.match(/#([\w\-]+)/);
      var id = idMatch ? idMatch[1] : null;

      return function(node) {
        if (tagName !== null) {
          if (typeof node.tagName !== "string" || node.tagName.toLowerCase() !== tagName) return false;
        }
        if (classNames.length) {
          if (!node.attributes.class) return false;
          // Class tokens may be separated by any whitespace; comparison stays
          // case-insensitive, as it always has been in this implementation.
          var present = String(node.attributes.class).toLowerCase().split(/\s+/);
          for (var i = 0; i < classNames.length; i++) {
            if (present.indexOf(classNames[i]) === -1) return false;
          }
        }
        if (id !== null) {
          if (node.attributes.id == null || String(node.attributes.id) !== id) return false;
        }
        for (var j = 0; j < attrTests.length; j++) {
          if (!attributeMatches(node, attrTests[j])) return false;
        }
        return true;
      };
    };

    var matchers = selector.split(",").map(function(s) {
      return compile(s.trim());
    });

    var walk = function(node) {
      node.children.forEach(function(child) {
        // some() stops at the first hit, so a node matching several
        // sub-selectors is still reported only once.
        var matched = matchers.some(function(m) { return m(child); });
        if (matched) results.push(child);
        walk(child);
      });
    };
    walk(this);
    return results;
  },

  /**
   * Appends a child node and points its `parentElement` back at this node.
   *
   * @param {Node} element - Node to attach.
   */
  appendChild: function(element) {
    this.children.push(element);
    element.parentElement = this;
  },

  /**
   * Renders this node and its subtree as indented markup.
   *
   * @param {number} [depth=0] - Number of spaces of indentation for this node;
   *   children are rendered at `depth + 2`.
   * @returns {string} Markup; every line starts with a newline plus indentation.
   */
  serialize: function(depth) {
    depth = depth || 0;
    var indent = "\n" + new Array(depth + 1).join(" ");
    if (this.type == "text") return indent + this.textContent;
    var output = "<" + this.tagName + " ";
    var self = this;
    output += Object.keys(this.attributes).map(function(n) {
      return n + '="' + self.attributes[n] + '"';
    }).join(" ");
    // trim() drops the trailing space left behind when there are no attributes
    output = indent + output.trim() + ">";
    if (this.textContent) output += indent + " " + this.textContent;
    output += this.children.map(function(c) {
      return c.serialize(depth + 2);
    }).join("");
    // The closing tag goes on its own line only when something was printed inside.
    output += (this.textContent || this.children.length ? indent : "") + "</" + this.tagName + ">";
    return output;
  }
};
// Void elements: they never have children or a closing tag, so the parser must
// not descend into them. The list follows the HTML standard's void elements.
var selfClosing = "area base br col embed hr img input link meta param source track wbr".split(" ");
// Elements whose contents are kept as raw text instead of being parsed as HTML.
var nullContents = "script".split(" ");
// Maps an open element to the start tags that implicitly close it (the HTML
// "optional end tag" cases). A nested <ul> does not close its <li>, and a
// <table> inside a cell does not close that cell, so nested lists and nested
// tables keep their structure.
var closedBy = {
  li: ["li"],
  dt: ["dt", "dd"],
  dd: ["dt", "dd"],
  option: ["option", "optgroup"],
  th: ["td", "th", "tr", "tbody", "thead", "tfoot"],
  td: ["td", "th", "tr", "tbody", "thead", "tfoot"],
  tr: ["tr", "tbody", "thead", "tfoot", "table"],
  tbody: ["tbody", "thead", "tfoot", "table"],
  thead: ["tbody", "thead", "tfoot", "table"],
  tfoot: ["tbody", "thead", "tfoot", "table"]
};
/**
 * Minimal, forgiving HTML parser. It holds no state; create one and call `parse`.
 */
var HTMLParser = function() {
};

HTMLParser.prototype = {
  /**
   * Parses an HTML string into a tree of `Node` objects.
   *
   * Comments and `<!...>` declarations are removed before parsing. Text is
   * trimmed and whitespace-only runs are dropped. Elements listed in
   * `nullContents` keep their body as raw `textContent`; elements listed in
   * `selfClosing` never receive children; `closedBy` drives implicit closing.
   *
   * @param {string} src - HTML source.
   * @returns {Node} A root node (no type, `tagName` null) holding the parsed tree.
   * @throws {string} "Ran out of document, starting from position N" when a tag
   *   opened at position N is never terminated by ">". The value is a plain
   *   string, kept that way for callers that match on it.
   */
  parse: function(src) {
    // Strip whole comments first: they may contain ">" (or markup), which the
    // generic "<!...>" pattern would cut short.
    src = src.replace(/<!--[\s\S]*?-->/g, "").replace(/<![^>]+>/g, "").trim();
    var hasOwn = Object.prototype.hasOwnProperty;
    var document = new Node();
    var branch = document;

    for (var c = 0; c < src.length; c++) {
      if (src[c] !== "<") {
        // Text run: everything up to the next tag, or to the end of the
        // document when no tag follows (trailing text must not be dropped).
        var stop = src.indexOf("<", c);
        if (stop === -1) stop = src.length;
        var contents = src.slice(c, stop).trim();
        if (contents.length) {
          var text = new Node("text");
          text.textContent = contents;
          branch.appendChild(text);
        }
        c = stop - 1; // the loop's c++ lands on the "<" (or past the end)
        continue;
      }

      // Tag: split its inside into whitespace-separated chunks, keeping quoted
      // attribute values (which may contain spaces or ">") in one piece.
      var quote = null;
      var i = c + 1;
      var buffer = "";
      var chunks = [];
      while (i < src.length) {
        var o = src[i];
        if (quote === null && (o === '"' || o === "'")) {
          quote = o;
        } else if (o === quote) {
          quote = null;
        }
        if (o === ">" && quote === null) break;
        if (quote === null && /\s/.test(o)) {
          // Runs of whitespace must not produce empty chunks, which would
          // otherwise become attributes with an empty name.
          if (buffer) chunks.push(buffer);
          buffer = "";
        } else {
          buffer += o;
        }
        i++;
      }
      if (buffer) chunks.push(buffer);
      if (i == src.length) throw "Ran out of document, starting from position " + c;

      var isClosing = src[c + 1] === "/";

      // Drop an XML-style trailing slash ("<br/>", 'src="a"/') so it does not
      // end up in the tag name or attribute value. A slash ending an unquoted
      // value (href=/a/) is part of that value and is left alone.
      if (!isClosing && chunks.length) {
        var last = chunks[chunks.length - 1];
        if (last.charAt(last.length - 1) === "/") {
          var trimmed = last.slice(0, -1);
          var unquotedValue = trimmed.indexOf("=") > -1 && !/["']$/.test(trimmed);
          if (!unquotedValue) {
            if (trimmed) {
              chunks[chunks.length - 1] = trimmed;
            } else {
              chunks.pop();
            }
          }
        }
      }

      var tagName = chunks.shift();
      if (isClosing) {
        tagName = tagName.slice(1);
        // Walk up to the nearest open element with this name; closing tags
        // that match nothing are ignored.
        var opening = branch;
        while (opening && opening.tagName !== tagName) {
          opening = opening.parentElement;
        }
        if (opening) {
          branch = opening.parentElement;
        }
      } else {
        var child = new Node("html");
        child.tagName = tagName;
        chunks.forEach(function(chunk) {
          if (chunk === "/") return;
          // Split on the first "=" only: values such as "?a=1&b=2" contain more.
          var eq = chunk.indexOf("=");
          var name = eq === -1 ? chunk : chunk.slice(0, eq);
          var value = eq === -1 ? "" : chunk.slice(eq + 1);
          // Remove one pair of matching single or double quotes.
          value = value.replace(/^(["'])([\s\S]*)\1$/, "$2").trim();
          child.attributes[name] = value;
        });
        // Is the current branch closed implicitly by this tag? Walk up if so.
        while (hasOwn.call(closedBy, branch.tagName) && closedBy[branch.tagName].indexOf(child.tagName) > -1) {
          branch = branch.parentElement;
        }
        branch.appendChild(child);
        if (nullContents.indexOf(child.tagName) > -1) {
          // Raw-text element (script): take everything up to its closing tag verbatim.
          var closer = "</" + child.tagName + ">";
          var start = i + 1;
          var end = src.indexOf(closer, start);
          if (end === -1) {
            // Unterminated: the rest of the document is the content. Without
            // this the cursor would jump backwards and could loop forever.
            child.textContent = src.slice(start);
            i = src.length;
          } else {
            child.textContent = src.slice(start, end);
            // Stop on the closer's ">" so the loop's c++ resumes right after it.
            i = end + closer.length - 1;
          }
        } else if (selfClosing.indexOf(child.tagName) == -1) {
          branch = child;
        }
      }
      c = i;
    }
    return document;
  }
};
// Command-line smoke test. It only runs where a `process` global with an `env`
// exists (e.g. Node.js): reads ./ratings.html, prints the parsed tree, then
// collects one [state, district, name, lean, rating] row per table data row.
if (typeof process != "undefined" && process.env) {
  var fs = require("fs");
  var html = fs.readFileSync("./ratings.html", "utf-8");
  var parser = new HTMLParser();
  var tree = parser.parse(html);
  console.log(tree.serialize());
  var modals = tree.querySelectorAll(".solid-seats-modal");
  var ratings = modals.reduce(function(collected, modal) {
    var titleNode = modal.querySelectorAll(".solid-seats-modal-in-title")[0];
    var rating = titleNode.innerHTML.trim();
    // The first data row is skipped.
    var dataRows = modal.querySelectorAll(".popup-table-data-row").slice(1);
    var extracted = dataRows.map(function(row) {
      var cells = row.querySelectorAll(".popup-table-data-cell").map(function(cell) {
        return cell.innerHTML.trim();
      });
      var race = cells[0];
      var name = cells[1];
      var lean = cells[2];
      // race has the form "<state>-<district>"
      var pieces = race.split("-");
      return [ pieces[0], pieces[1], name, lean, rating ];
    });
    return collected.concat(extracted);
  }, []);
  // console.log("state,district,name,lean,rating\n" + ratings.map(function(r) { return r.join() }).join("\n"));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.