Skip to content

Instantly share code, notes, and snippets.

@motatoes
Last active June 27, 2017 23:31
Show Gist options
  • Save motatoes/792765736826cd3466a0 to your computer and use it in GitHub Desktop.
Save motatoes/792765736826cd3466a0 to your computer and use it in GitHub Desktop.
House of lords info scraping (casperJS)
// == This script extracts emails and other data about members of the UK House of Lords from the following url:
// http://www.parliament.uk/mps-lords-and-offices/lords/
var casper = require('casper').create();
var fs = require('fs');
// Links to each House of Lords profile page; filled in once the index page has been scraped
var links = [];
// Scraped profile rows, one per visited profile; written to disk at the end of this script
var scrapedRows = [];
// == Some helper functions == //
/**
 * Reads a property from the first element matching a CSS selector.
 *
 * This function is passed to casper.evaluate and therefore runs inside the
 * scraped page: it must stay self-contained and use only the page's `document`.
 *
 * @param {string} selector - CSS selector handed to document.querySelector.
 * @param {string} property - Name of the element property to read (e.g. 'innerHTML').
 * @param {*} defaultValue - Value returned when nothing usable is found.
 * @returns {*} The property value, or defaultValue when no element matches or
 *   the matched element has no value (null/undefined) for that property.
 */
function querySelectorGet(selector, property, defaultValue) {
    var element = document.querySelector(selector);
    if (!element) {
        return defaultValue;
    }
    var value = element[property];
    // Callers chain string methods such as .trim() onto the result, so a
    // missing property must fall back to the default instead of leaking undefined.
    return (value === null || value === undefined) ? defaultValue : value;
}
/**
 * Collects the href attribute of every anchor located inside a table on the
 * current page. Runs in page context (it is passed to casper.evaluate).
 *
 * @returns {string[]} href values in document order; anchors that carry no
 *   href (or an empty one) are skipped because they cannot be opened later.
 */
function getLinks() {
    var anchors = document.querySelectorAll('table a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        var href = anchors[i].getAttribute('href');
        // getAttribute returns null for anchors without an href attribute.
        if (href) {
            hrefs.push(href);
        }
    }
    return hrefs;
}
/**
 * Extracts the details of one House of Lords member from the profile page
 * currently loaded. Runs in page context (it is passed to casper.evaluate),
 * so every helper it needs is either received as an argument or nested here.
 *
 * @param {function(string, string, *): *} querySelectorGet - Lookup helper
 *   returning a property of the first element matching a selector, or a default.
 * @returns {Object} Plain object with the keys name, title, email, party,
 *   dateJoined, phone, address1, address2 and socialMedia. Text fields hold the
 *   trimmed innerHTML of their element, or 'N/A' when the element is absent.
 */
function scrapLordDetails(querySelectorGet) {
    // Trimmed innerHTML of the first element matching `selector`, or 'N/A'.
    function readHtml(selector) {
        return querySelectorGet(selector, 'innerHTML', 'N/A').trim();
    }

    // The e-mail address is taken from the first link whose href starts with 'mailto'.
    var emailHtml = readHtml("a[href^='mailto']");

    // The remaining fields are located through element IDs.
    var titleHtml = readHtml("div#lords-fulltitle");
    var nameHtml = readHtml("div#lords-name");
    var partyHtml = readHtml("div#lords-party-group");
    var joinedHtml = readHtml("div#joined-lords");
    var phoneHtml = readHtml("#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone");
    var firstAddressHtml = readHtml('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl00_pnlAddress');
    var secondAddressHtml = readHtml('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl01_pnlAddress');

    // A profile may list any number of social media links; gather every href.
    var socialAnchors = document.querySelectorAll('#web-social-media a');
    var socialHrefs = [];
    for (var i = 0; i < socialAnchors.length; i++) {
        socialHrefs.push(socialAnchors[i].getAttribute('href'));
    }

    // Returned as a plain object (the caller serialises it to JSON later).
    return {
        name: nameHtml,
        title: titleHtml,
        email: emailHtml,
        party: partyHtml,
        dateJoined: joinedHtml,
        phone: phoneHtml,
        address1: firstAddressHtml,
        address2: secondAddressHtml,
        // The number of links varies, so they are flattened into one ' ; '-separated string.
        socialMedia: socialHrefs.join(' ; ')
    };
}
// ==\ Some helper functions \== //
// == Let the scraping begin == //
// Step 1: open the index page listing the members of the House of Lords.
casper.start('http://www.parliament.uk/mps-lords-and-offices/lords/', function() {
    var pageTitle = this.getTitle();
    this.echo('Opened main site titled: ' + pageTitle);
});

// Step 2: gather the profile links from the index, then visit each one in turn.
casper.then(function() {
    links = this.evaluate(getLinks);
    this.echo('scraping links ...');

    this.eachThen(links, function(response) {
        // response.data carries the current item of `links`.
        var profileUrl = response.data;
        this.open(profileUrl).then(function() {
            // querySelectorGet is passed along so it is available inside the page context.
            var profileRow = this.evaluate(scrapLordDetails, querySelectorGet);
            scrapedRows.push(profileRow);

            // Progress report.
            var progress = 'Scraped row ' + scrapedRows.length + ' of ' + links.length;
            this.echo(progress);
        });
    });
});

// Step 3: save everything as JSON (convertible to CSV with e.g. http://konklone.io/json/).
casper.then(function() {
    var serializedRows = JSON.stringify(scrapedRows);
    fs.write('lords.json', serializedRows, 'w');
});

// Start the queued steps and quit once they have all completed.
casper.run(function() {
    this.exit();
});
// == This script extracts emails and other data about UK Members of Parliament (MPs) from the following url:
// http://www.parliament.uk/mps-lords-and-offices/mps/
var casper = require('casper').create();
var fs = require('fs');
// Links to each MP profile page; filled in once the index page has been scraped
var links = [];
// Scraped profile rows, one per visited profile; written to disk at the end of this script
var scrapedRows = [];
// == Some helper functions == //
/**
 * Looks up the first element matching a CSS selector and returns one of its
 * properties.
 *
 * Passed to casper.evaluate, so it executes inside the scraped page and must
 * not depend on anything other than the page's `document`.
 *
 * @param {string} selector - CSS selector handed to document.querySelector.
 * @param {string} property - Name of the element property to read (e.g. 'innerHTML').
 * @param {*} defaultValue - Fallback returned when nothing usable is found.
 * @returns {*} The property value, or defaultValue when no element matches or
 *   the property is null/undefined on the matched element.
 */
function querySelectorGet(selector, property, defaultValue) {
    var match = document.querySelector(selector);
    if (match === null || match === undefined) {
        return defaultValue;
    }
    var found = match[property];
    // Callers immediately call .trim() on the result; never hand back
    // undefined/null for a property the element does not provide.
    if (found === null || found === undefined) {
        return defaultValue;
    }
    return found;
}
/**
 * Returns the href attribute of every anchor found inside a table on the
 * current page. Runs in page context (it is passed to casper.evaluate).
 *
 * @returns {string[]} href values in document order, excluding anchors that
 *   have no href (or an empty one), since those cannot be opened afterwards.
 */
function getLinks() {
    var anchors = document.querySelectorAll('table a');
    var hrefs = Array.prototype.map.call(anchors, function(anchor) {
        return anchor.getAttribute('href');
    });
    // getAttribute yields null when the attribute is absent; drop those entries.
    return hrefs.filter(function(href) {
        return Boolean(href);
    });
}
/**
 * Extracts the details of one MP from the profile page currently loaded.
 * Runs in page context (it is passed to casper.evaluate), so every helper it
 * needs is either received as an argument or nested inside this function.
 *
 * @param {function(string, string, *): *} querySelectorGet - Lookup helper
 *   returning a property of the first element matching a selector, or a default.
 * @returns {Object} Plain object with the keys name, title, email, party,
 *   dateJoined, phone, constituency, address1, address2 and socialMedia. Text
 *   fields hold the trimmed innerHTML of their element, or 'N/A' when absent.
 */
function scrapLordDetails(querySelectorGet) {
    // Trimmed innerHTML of the first element matching `selector`, or 'N/A'.
    function readHtml(selector) {
        return querySelectorGet(selector, 'innerHTML', 'N/A').trim();
    }

    // Get the first 'a' tag that has a 'mailto' href value
    var email = readHtml("a[href^='mailto']");
    // The rest of the information can be extracted via ID tags
    var title = readHtml("div#commons-biography-header h1");
    var name = readHtml("div#commons-addressas");
    var party = readHtml("div#commons-party");
    // NOTE(review): this is the Lords selector reused on an MP page - confirm it matches anything here.
    var dateJoined = readHtml("div#joined-lords");
    // Trimmed like every other field (it was previously returned with surrounding whitespace).
    var phone = readHtml("#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone");
    var constituency = readHtml("div#commons-constituency");
    var address1 = readHtml('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlAddress');
    // NOTE(review): same selector as `phone`, so address2 duplicates the phone value -
    // presumably a second address panel was intended; verify against the page markup.
    var address2 = readHtml('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone');

    // There are multiple social media links possibly ..
    var socialMediaATags = document.querySelectorAll('#social-media a');
    var socialMedia = Array.prototype.map.call(socialMediaATags, function(e) {
        return e.getAttribute('href');
    });

    // Returned as a plain object (the caller serialises it to JSON later).
    return {
        name: name,
        title: title,
        email: email,
        party: party,
        dateJoined: dateJoined,
        phone: phone,
        constituency: constituency,
        address1: address1,
        address2: address2,
        // There is no fixed number of social media links, so they are joined with a semicolon
        socialMedia: socialMedia.join(' ; ')
    };
}
// ==\ Some helper functions \== //
// == Let the scraping begin == //
// Step 1: open the index page listing the MPs.
casper.start('http://www.parliament.uk/mps-lords-and-offices/mps/', function() {
    this.echo('Opened main site titled: ' + this.getTitle());
});

// Step 2: gather the profile links from the index, then visit each one in turn.
casper.then(function() {
    // aggregate all the links to the MP profiles
    links = this.evaluate(getLinks);
    this.echo('scraping links ...');
    // For each link
    casper.eachThen(links, function(response) {
        // response.data carries the current item of `links`.
        casper.open(response.data).then(function() {
            // We pass the querySelectorGet method to use it within the webpage context
            var row = this.evaluate(scrapLordDetails, querySelectorGet);
            scrapedRows.push(row);
            // Stats display
            this.echo('Scraped row ' + scrapedRows.length + ' of ' + links.length);
        });
    });
});

// Step 3: save everything as JSON; you can convert it to a csv using: http://konklone.io/json/
casper.then(function() {
    // The MP data gets its own file: the previous name 'lords.json' was a leftover
    // from the House of Lords script and would overwrite that script's output.
    fs.write('mps.json', JSON.stringify(scrapedRows), 'w');
});

// Start the queued steps and quit once they have all completed.
casper.run(function() {
    casper.exit();
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment