Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Scraping Denver Legislation with CasperJS
// CasperJS instance used for all navigation and page queries below.
var casper = require("casper").create();
// Wraps an XPath expression so it can be passed to casper.exists/casper.click.
var x = require('casper').selectXPath;
// File-system module; scrapeYear uses it to write the JSON output.
var fs = require("fs");
// Trying to scrape everything doesn't seem to work; eventually the page
// just goes blank. It makes sense to scrape by year anyway--that way
// each year can go in a separate JSON file.
// scrapeYear pops from the end of this list, so the last entry is scraped first.
// NOTE(review): 2012 is not in this list -- confirm whether that is intentional.
var years = [2010, 2011, 2013, 2014];
// Queue of item IDs still to visit: filled by scrapeIds, drained by scrapeItems.
var ids = [];
// Records scraped so far; scrapeYear serializes this array to JSON.
var items = [];
/**
 * Collects the item IDs from the current page of search results, then follows
 * the "Next >" link and repeats until there is no further page.
 *
 * Every result row is a `tr[onclick]`; the first run of digits in its onclick
 * attribute is taken as the item ID and appended to the shared `ids` queue as
 * a string. Rows whose onclick attribute contains no digits are skipped
 * instead of queueing a null entry.
 *
 * Meant to be run as a CasperJS step (directly or via casper.then/waitFor).
 */
function scrapeIds() {
    if (!casper.exists("tr[onclick]")) {
        // No result rows on this page: nothing to collect, nothing to paginate.
        return;
    }

    var rows = casper.getElementsInfo("tr[onclick]");
    for (var i = 0; i < rows.length; ++i) {
        var idMatch = rows[i].attributes.onclick.match(/\d+/);
        if (idMatch) {
            // Store the matched text itself, not the whole match array.
            ids.push(idMatch[0]);
        }
    }

    var nextLink = x("//*[text()='Next >']");
    if (casper.exists(nextLink)) {
        casper.click(nextLink);
        // Queue another pass for the page the click leads to.
        casper.then(scrapeIds);
    }
}
/**
 * Reads the recorded votes from the item page. This function is handed to
 * casper.evaluate, so it runs inside the page and must stay self-contained:
 * it may only use `document`, never variables from this script.
 *
 * Rows without an ".attendee" cell are skipped, and a row without a
 * ".votefor" cell yields a null vote, so one unexpected row no longer
 * throws and discards every vote on the page.
 * NOTE(review): ".votefor" may only mark one kind of vote -- confirm against
 * the live markup whether other vote cells use a different class.
 *
 * @returns {?Array} one {member, vote} object per row that has a member cell,
 *     or null when the page contains no ".attendee" element at all.
 */
function readVotesFromPage() {
    if (!document.querySelector(".attendee")) {
        return null;
    }
    var rows = document.querySelectorAll("#ItemHistory_tblVoting table table tr");
    var votes = [];
    for (var i = 0; i < rows.length; ++i) {
        var memberCell = rows[i].querySelector(".attendee");
        var voteCell = rows[i].querySelector(".votefor");
        if (!memberCell) {
            continue;
        }
        votes.push({
            "member": memberCell.innerText,
            "vote": voteCell ? voteCell.innerText : null
        });
    }
    return votes;
}

/**
 * Takes the next ID off the shared `ids` queue, opens that item's page,
 * appends one record to the shared `items` array, and re-queues itself until
 * `ids` is empty. Does nothing when the queue is already empty.
 *
 * Obviously there's more info to be scraped, but this is a start.
 */
function scrapeItems() {
    if (ids.length === 0) {
        // Nothing queued (e.g. a search with no results): avoid requesting
        // item.aspx?itemid=undefined.
        return;
    }

    var id = ids.pop();
    casper.open("http://www.denvergov.org/sirepub/item.aspx?itemid=" + id);
    casper.then(function() {
        if (casper.exists("#ItemHistory_lblTitle")) {
            var version = 1;
            var currentRow = "#ItemHistory_tblHistory tr.currenthistoryitem";
            // There are multiple versions of some items. If we're looking at
            // the first one, add the rest to the queue.
            if (casper.exists(currentRow)) {
                var versionMatch = casper.getElementInfo(currentRow + " td:nth-child(2)").html.match(/\d+/);
                // A cell without digits yields 0, so it is never mistaken for
                // version 1 and never enqueues the other versions.
                version = versionMatch ? parseInt(versionMatch[0], 10) : 0;
                if (version === 1 && casper.exists("a[title='Open this version']")) {
                    var links = casper.getElementsAttribute("a[title='Open this version']", "href");
                    for (var i = 0; i < links.length; ++i) {
                        var linkMatch = links[i].match(/\d+/);
                        if (linkMatch) {
                            // Queue the ID string; skip hrefs without digits.
                            ids.push(linkMatch[0]);
                        }
                    }
                }
            }

            var result = "unknown";
            if (casper.exists("span.motionpassed")) {
                result = "passed";
            } else if (casper.exists("span.motionfailed")) {
                result = "failed";
            }

            items.push({
                "title": casper.getElementInfo("#ItemHistory_lblTitle").text,
                "num": casper.getElementInfo("#ItemHistory_lblTrackingNumber").text,
                "type": casper.getElementInfo("#ItemHistory_lblType").text,
                "date": casper.getElementInfo("#ItemHistory_lblMeetingDate > a").text,
                "version": version,
                "result": result,
                // Some items don't have (recorded) votes, so this could be null
                "votes": casper.evaluate(readVotesFromPage)
            });
        } else {
            // Sometimes we get an error
            console.log("Failed: " + id);
        }

        if (ids.length > 0) {
            scrapeItems();
        }
    });
}
/**
 * Scrapes one year of legislation: takes the last entry off the shared
 * `years` list, runs the search for it, collects the item IDs (scrapeIds),
 * visits every item (scrapeItems), writes the result to "denver-<year>.json",
 * and then reloads the search page for the next year if any remain.
 *
 * Expects the search page (items.aspx) to be the currently loaded page and
 * is meant to run as a CasperJS step.
 */
function scrapeYear() {
    var year = years.pop();

    // The queues are shared across years. Clear them in place so this year's
    // JSON file does not also contain every item scraped for earlier years.
    items.length = 0;
    ids.length = 0;

    // Screenshot of the search page as it looks before the form is filled.
    casper.capture("denver-" + year + ".png");
    casper.fill("form#form1", {"dd_meet_date": year});
    casper.click("#Itemsmplsrch1_btnSearch");

    // Upon clicking, the button will say "Searching...", then it'll
    // revert back to "Search" when the results are ready
    casper.waitFor(function check() {
        return casper.evaluate(function() {
            return document.getElementById("Itemsmplsrch1_btnSearch").value === "Search";
        });
    }, scrapeIds);

    casper.then(scrapeItems);

    casper.then(function() {
        fs.write("denver-" + year + ".json", JSON.stringify(items), "w");
    });

    casper.then(function() {
        if (years.length > 0) {
            // Reload the search form before starting on the next year.
            casper.open("http://www.denvergov.org/sirepub/items.aspx");
            casper.then(scrapeYear);
        }
    });
}
// Entry point: load the search page with scrapeYear as the first step,
// then start executing the queued steps.
casper.start("http://www.denvergov.org/sirepub/items.aspx", scrapeYear);
casper.run();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment