Created
March 7, 2015 21:36
-
-
Save peytonm/8b9670a76102f4c8644a to your computer and use it in GitHub Desktop.
Scraping Denver Legislation with CasperJS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var casper = require("casper").create(); | |
var x = require('casper').selectXPath; | |
var fs = require("fs"); | |
// Trying to scrape everything at once doesn't seem to work; eventually the
// page just goes blank. It makes sense to scrape by year anyway -- that way
// each year can go in a separate JSON file.
// NOTE(review): 2012 is absent from this list -- confirm whether that is
// intentional before relying on the output being complete.
var years = [2010, 2011, 2013, 2014];

// Queue of item IDs (digit strings) still waiting to be scraped.
var ids = [];

// Records scraped so far for the year currently being processed.
var items = [];
// Scrape the item IDs on the paginated year searches, until there are no
// more pages.
//
// Every result row is a `tr` with an `onclick` attribute; the item ID is taken
// to be the first run of digits in that attribute and is queued on `ids` as a
// string. If the page has a "Next >" link it is clicked and this function is
// scheduled again as the following step.
function scrapeIds() {
  if (!casper.exists("tr[onclick]")) {
    // No result rows on this page: nothing to collect and nowhere to go.
    return;
  }

  var itemRows = casper.getElementsInfo("tr[onclick]");
  for (var i = 0; i < itemRows.length; ++i) {
    // match() yields an array (or null when there are no digits); queue only
    // the matched text so later URL building never sees "null" or an array.
    var found = itemRows[i].attributes.onclick.match(/\d+/);
    if (found) {
      ids.push(found[0]);
    }
  }

  var nextLink = x("//*[text()='Next >']");
  if (casper.exists(nextLink)) {
    casper.click(nextLink);
    casper.then(scrapeIds);
  }
}
// Scrape one queued item: pop an ID off `ids`, open its detail page and
// append a record {title, num, type, date, version, result, votes} to
// `items`. Re-schedules itself until `ids` is empty.
//
// Obviously there's more info to be scraped, but this is a start.
function scrapeItems() {
  // A search that returned no rows leaves the queue empty; without this guard
  // we would request "itemid=undefined".
  if (ids.length === 0) {
    return;
  }

  var id = ids.pop();
  casper.open("http://www.denvergov.org/sirepub/item.aspx?itemid=" + id);
  casper.then(function() {
    if (casper.exists("#ItemHistory_lblTitle")) {
      items.push(readCurrentItem());
    } else {
      // Sometimes we get an error
      console.log("Failed: " + id);
    }
    if (ids.length > 0) {
      scrapeItems();
    }
  });
}

// Returns the text of the first element matching `selector`, or null when the
// page has no such element (getElementInfo would throw in that case).
function textOrNull(selector) {
  return casper.exists(selector) ? casper.getElementInfo(selector).text : null;
}

// Returns the version number shown in the highlighted history row of the
// loaded item page: 1 when there is no such row, 0 when the row holds no
// digits.
function readVersion() {
  if (!casper.exists("#ItemHistory_tblHistory tr.currenthistoryitem")) {
    return 1;
  }
  var cell = casper.getElementInfo("#ItemHistory_tblHistory tr.currenthistoryitem td:nth-child(2)");
  var found = cell.html.match(/\d+/);
  return found ? parseInt(found[0], 10) : 0;
}

// Queues the item ID of every "Open this version" link on the loaded page.
function queueOtherVersions() {
  if (!casper.exists("a[title='Open this version']")) {
    return;
  }
  var links = casper.getElementsAttribute("a[title='Open this version']", "href");
  for (var i = 0; i < links.length; ++i) {
    // Skip links without an href or without digits rather than queueing junk.
    var found = links[i] ? links[i].match(/\d+/) : null;
    if (found) {
      ids.push(found[0]);
    }
  }
}

// Returns the recorded votes on the loaded page as [{member, vote}], or null
// when the page shows no attendees. Some items don't have (recorded) votes.
function readVotes() {
  // The callback runs inside the page, so it must not reference anything
  // from this script's scope.
  return casper.evaluate(function() {
    if (!document.querySelector(".attendee")) {
      return null;
    }
    var rows = document.querySelectorAll("#ItemHistory_tblVoting table table tr");
    var votes = [];
    for (var i = 0; i < rows.length; ++i) {
      var member = rows[i].querySelector(".attendee");
      if (!member) {
        // Row without a member cell; one such row used to throw and discard
        // every vote on the page.
        continue;
      }
      // NOTE(review): only ".votefor" cells are read; presumably other vote
      // outcomes use a different class -- confirm. Until then they are null.
      var vote = rows[i].querySelector(".votefor");
      votes.push({
        "member": member.innerText,
        "vote": vote ? vote.innerText : null
      });
    }
    return votes;
  });
}

// Builds the record for the item page currently loaded. The caller has
// already checked that "#ItemHistory_lblTitle" exists.
function readCurrentItem() {
  var version = readVersion();
  // There are multiple versions of some items. If we're looking at the first
  // one, add the rest to the queue.
  if (version === 1) {
    queueOtherVersions();
  }

  var result = "unknown";
  if (casper.exists("span.motionpassed")) {
    result = "passed";
  } else if (casper.exists("span.motionfailed")) {
    result = "failed";
  }

  return {
    "title": casper.getElementInfo("#ItemHistory_lblTitle").text,
    "num": textOrNull("#ItemHistory_lblTrackingNumber"),
    "type": textOrNull("#ItemHistory_lblType"),
    "date": textOrNull("#ItemHistory_lblMeetingDate > a"),
    "version": version,
    "result": result,
    "votes": readVotes()
  };
}
// Scrape one year: pop a year off `years`, run the search for it on the
// currently loaded search page, collect the item IDs, scrape every item and
// write the records to "denver-<year>.json". When more years remain, reloads
// the search page and schedules itself again.
function scrapeYear() {
  if (years.length === 0) {
    return;
  }
  var year = years.pop();

  // Start each year with empty accumulators; otherwise every JSON file would
  // also contain the records of all years scraped before it.
  ids.length = 0;
  items.length = 0;

  casper.capture("denver-" + year + ".png");
  casper.fill("form#form1", {"dd_meet_date": year});
  casper.click("#Itemsmplsrch1_btnSearch");

  // Upon clicking, the button will say "Searching...", then it'll
  // revert back to "Search" when the results are ready.
  // NOTE(review): if the first check runs before the label has changed to
  // "Searching...", this wait may pass immediately -- confirm against the site.
  casper.waitFor(function check() {
    return casper.evaluate(function() {
      return document.getElementById("Itemsmplsrch1_btnSearch").value === "Search";
    });
  }, scrapeIds);

  casper.then(scrapeItems);

  casper.then(function() {
    fs.write("denver-" + year + ".json", JSON.stringify(items), "w");
  });

  casper.then(function() {
    if (years.length > 0) {
      casper.open("http://www.denvergov.org/sirepub/items.aspx");
      casper.then(scrapeYear);
    }
  });
}
// Entry point: load the search page, hand control to scrapeYear for the
// first year, then run the queued steps.
casper.start("http://www.denvergov.org/sirepub/items.aspx", scrapeYear);
casper.run();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment