Skip to content

Instantly share code, notes, and snippets.

@shadedyin
Created March 16, 2014 02:34
Show Gist options
  • Save shadedyin/9577747 to your computer and use it in GitHub Desktop.
Save shadedyin/9577747 to your computer and use it in GitHub Desktop.
var http = require('http');
var cheerio = require('cheerio');
var numeral = require('numeral')();
var rsvp = require('rsvp');
var _ = require('underscore');
var Model = require('./model');
/**
 * Polling scraper. On construction it immediately fetches `opts.url`, passes
 * the cheerio-loaded page to `onHTML`, hands that return value to `onJSON`,
 * then waits `timeout` milliseconds and starts the next cycle.
 *
 * Every property of `opts` is copied onto the instance, and both callbacks
 * are invoked as methods of the instance, so they can read any extra option
 * through `this`.
 *
 * A failed cycle (network error, or an exception thrown by `onHTML` /
 * `onJSON`) is logged with `console.error` and the next cycle is still
 * scheduled, so one bad fetch does not stop the polling.
 *
 * @param {Object} opts
 * @param {string} opts.url - Page to fetch (required).
 * @param {number} [opts.timeout=60000] - Delay in ms between two cycles.
 * @param {Function} [opts.onHTML] - Receives the cheerio document; its return
 *   value is forwarded to `onJSON`.
 * @param {Function} [opts.onJSON] - Receives whatever `onHTML` returned.
 * @throws {Error} When no `url` option is given.
 */
var Scraper = function(opts) {
  var self = this;
  // Tolerate `new Scraper()` so the caller gets the "url" error below rather
  // than a TypeError from reading a property of undefined.
  opts = opts || {};

  // No-op defaults; replaced by `opts.onHTML` / `opts.onJSON` when supplied.
  self.onHTML = function($) { };
  self.onJSON = function($) { };
  _.extend(self, opts);

  // Validate the instance's own url, not a same-named variable that happens
  // to exist in the enclosing scope.
  if(!self.url) {
    throw new Error('Scraper: Cannot be created with out a "url".');
  }
  self.timeout = opts.timeout || 60000;

  // Downloads `url` and resolves with the value returned by `onHTML`.
  var crawl = function(url) {
    var json = rsvp.defer();
    var html = '';
    http.get(url, function(res) {
      // Decode on the stream so a multi-byte character that straddles two
      // chunks is not corrupted by per-chunk string conversion.
      res.setEncoding('utf8');
      res.on('data', function(chunk) {
        html += chunk;
      }).on('end', function() {
        // Turn a parsing failure into a rejection instead of an uncaught
        // exception inside the event handler.
        try {
          json.resolve(self.onHTML(cheerio.load(html)));
        } catch(error) {
          json.reject(error);
        }
      }).on('error', function(error) { json.reject(error); });
    }).on('error', function(error) { json.reject(error); });
    return json.promise;
  };

  // One polling cycle; reschedules itself whether the cycle succeeded or not.
  var run = function() {
    crawl(self.url).then(function(json) {
      self.onJSON(json);
    }).then(null, function(error) {
      console.error('Scraper: crawl of "' + self.url + '" failed:', error);
    }).then(function() {
      setTimeout(run, self.timeout);
    });
  };

  // `run` returns undefined, so `new Scraper(...)` still yields the instance.
  return run();
};
// Page that gets polled. NOTE(review): the path suggests precinct-level
// results dated 03/15/2014 — confirm against the live page.
var url =
  'http://staticresults.sos.la.gov/03152014/03152014_36_50211_Precinct.html';

/**
 * Scraper for the page above. `onHTML` turns the results table into a nested
 * { ward, results } object; `onJSON` writes that object out through
 * `Model.ref`.
 */
var scraper = new Scraper({
  url: url,
  // Key under which this race's tallies are stored in every `results` object.
  key: 'sheriff',
  // Names given to the value columns, in column order.
  runners: [ 'foti', 'gussman' ],
  // Selects the table rows to walk.
  rows: function($) { return $('table tr'); },
  // Selects the cells of one row.
  cols: function($, row_el) { return $(row_el).children(); },

  /**
   * Builds the tally object from the loaded page.
   *
   * @param {Function} $ - cheerio document.
   * @returns {Object} `{ ward: {...}, results: {...} }`; the per-race tallies
   *   live under `results[this.key]` at the top level, on each ward and on
   *   each precinct.
   */
  onHTML: function($) {
    var self = this;
    var json = { ward: { }, results: { } };
    var rows = self.rows($);
    var lastRow = rows.length - 1;

    // Returns node.results[self.key], creating both levels on first use.
    var tally = function(node) {
      if(!node.results) {
        node.results = { };
      }
      if(!node.results[self.key]) {
        node.results[self.key] = { };
      }
      return node.results[self.key];
    };

    rows.each(function(i, row_el) {
      // The first and last rows are skipped — presumably a header row and a
      // totals row; TODO confirm against the page.
      if(i === 0 || i === lastRow) { return; }

      var ward = null;
      var precinct = null;
      self.cols($, row_el).each(function(j, col_el) {
        var text = $(col_el).text();

        if(j === 0) {
          // First cell holds "<ward> <precinct>". Split on any whitespace run
          // so padding around or between the two tokens cannot produce an
          // empty id.
          var ids = text.trim().split(/\s+/);
          ward = Model.getWard(json, 'W' + ids[0]);
          precinct = Model.getPrecinct(ward, 'P' + ids[1]);
          return;
        }

        // Cycle through the configured runners. The modulus follows the
        // length of `runners` (it used to be a fixed 4), so a column never
        // maps to an undefined runner.
        var runner = self.runners[(j - 1) % self.runners.length];
        var value = numeral.unformat(text);

        // NOTE(review): Model.set / Model.inc are defined in ./model —
        // presumably "assign" and "add to running total"; confirm there.
        Model.set(tally(precinct), runner, value);
        Model.inc(tally(ward), runner, value);
        Model.inc(tally(json), runner, value);
      });
    });
    return json;
  },

  /**
   * Writes the tallies produced by `onHTML` through `Model.ref`: the overall
   * results, then each ward's results per election, then each precinct's
   * results for that election.
   *
   * @param {Object} json - Value returned by `onHTML`.
   */
  onJSON: function(json) {
    // NOTE(review): Model.ref exposes child()/set() — presumably a
    // Firebase-style reference; confirm in ./model.
    Model.ref.child('results').set(json.results);
    for(var ward in json.ward) {
      var wardData = json.ward[ward];
      var Ward = Model.ref.child('ward').child(ward);
      for(var election in wardData.results) {
        Ward.child('results').child(election)
          .set(wardData.results[election]);
        for(var precinct in wardData.precinct) {
          var Precinct = Ward.child('precinct').child(precinct);
          Precinct.child('results').child(election)
            .set(wardData.precinct[precinct].results[election]);
        }
      }
    }
  }
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment