motatoes/ExtractParliament.js

## extractHOL.js
// == This script extracts emails and other data about members of the UK House of Lords from the following URL:
// http://www.parliament.uk/mps-lords-and-offices/lords/

var casper = require('casper').create();
var fs = require('fs');

// This array maintains a list of links to each HOL profile
var links = [];

// This array maintains the scraped information and is saved by the end of this script
var scrapedRows = [];

// == Some helper functions == //


// Reads one property from the first element matching a CSS selector.
//
// selector     - CSS selector handed to document.querySelector.
// property     - name of the property to read from the matched element (e.g. 'innerHTML').
// defaultValue - returned when no element matches, or when the matched element's
//                property is null/undefined.
//
// Callers chain .trim() onto the result, so a null/undefined property value must not leak out.
// The function only touches `document` and its own arguments: it is handed to
// casper.evaluate as an argument so it can be used within the webpage context.
function querySelectorGet(selector, property, defaultValue) {
	var element = document.querySelector(selector);
	if (!element) {
		return defaultValue;
	}
	var value = element[property];
	// Loose `== null` deliberately matches both null and undefined; '' and 0 are kept as-is.
	return value == null ? defaultValue : value;
}

// Collects the raw href attribute of every anchor found inside a table of the
// current document. Returns an array with one entry per anchor, in document
// order; an entry is whatever getAttribute('href') yields for that anchor.
function getLinks() {
	var anchors = document.querySelectorAll('table a');
	var hrefs = [];
	for (var i = 0; i < anchors.length; i++) {
		hrefs.push(anchors[i].getAttribute('href'));
	}
	return hrefs;
}

// Scrapes one House of Lords profile page into a plain object.
//
// querySelectorGet - function(selector, property, defaultValue) used for every
//                    single-element lookup; it is received as a parameter
//                    instead of being read from an outer scope.
//
// Returns an object with the keys name, title, email, party, dateJoined, phone,
// address1, address2 and socialMedia. Each text field is the trimmed innerHTML
// of its element, or 'N/A' when querySelectorGet falls back to the default.
function scrapLordDetails(querySelectorGet) {
	// Trimmed innerHTML of the first match for `selector`, 'N/A' as fallback.
	function readText(selector) {
		return querySelectorGet(selector, 'innerHTML', 'N/A').trim();
	}

	// The email is taken from the first 'a' tag whose href starts with 'mailto'
	var emailText = readText("a[href^='mailto']");

	// Everything else is looked up through ID selectors
	var titleText = readText('div#lords-fulltitle');
	var nameText = readText('div#lords-name');
	var partyText = readText('div#lords-party-group');
	var joinedText = readText('div#joined-lords');
	var phoneText = readText('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone');
	var firstAddress = readText('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl00_pnlAddress');
	var secondAddress = readText('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl01_pnlAddress');

	// A profile can carry any number of social media links; gather every href
	var socialAnchors = document.querySelectorAll('#web-social-media a');
	var socialHrefs = [];
	for (var i = 0; i < socialAnchors.length; i++) {
		socialHrefs.push(socialAnchors[i].getAttribute('href'));
	}

	// Key order is kept stable because it is the order the fields are serialised in
	return {
		name: nameText,
		title: titleText,
		email: emailText,
		party: partyText,
		dateJoined: joinedText,
		phone: phoneText,
		address1: firstAddress,
		address2: secondAddress,
		// Variable number of links, so they are flattened into one ' ; '-separated string
		socialMedia: socialHrefs.join(' ; ')
	};
}


// ==\ Some helper functions \== //

// == Let the scraping begin == //


// Step 1: open the House of Lords index page and report its title.
casper.start('http://www.parliament.uk/mps-lords-and-offices/lords/', function() {
	var pageTitle = this.getTitle();
	this.echo('Opened main site titled: ' + pageTitle);
});

// Step 2: gather the profile links from the index page, then queue one visit per link.
casper.then(function() {
	links = this.evaluate(getLinks);
	this.echo('scraping links ...');

	casper.eachThen(links, function(response) {
		// response.data holds the entry of `links` this step was queued for
		var profileUrl = response.data;

		casper.open(profileUrl).then(function() {
			// querySelectorGet is handed over as an argument so that
			// scrapLordDetails can use it within the webpage context
			var row = this.evaluate(scrapLordDetails, querySelectorGet);
			// push() returns the new length, which doubles as the progress counter
			var scrapedCount = scrapedRows.push(row);

			this.echo('Scraped row ' + scrapedCount + ' of ' + links.length);
		});
	});
});

// Step 3: dump every scraped row into a single JSON file
// (it can be converted to CSV with http://konklone.io/json/).
casper.then(function() {
	var serialisedRows = JSON.stringify(scrapedRows);
	fs.write('lords.json', serialisedRows, 'w');
});

// Run the queued steps and quit once they have completed.
casper.run(function() {
	casper.exit();
});

## ExtractParliament.js
// == This script extracts emails and other data about UK Members of Parliament (House of Commons) from the following URL:
// http://www.parliament.uk/mps-lords-and-offices/mps/

var casper = require('casper').create();
var fs = require('fs');

// This array maintains a list of links to each MP profile
var links = [];

// This array maintains the scraped information and is saved by the end of this script
var scrapedRows = [];

// == Some helper functions == //


// Reads one property from the first element matching a CSS selector.
//
// selector     - CSS selector handed to document.querySelector.
// property     - name of the property to read from the matched element (e.g. 'innerHTML').
// defaultValue - returned when no element matches, or when the matched element's
//                property is null/undefined.
//
// Callers chain .trim() onto the result, so a null/undefined property value must not leak out.
// The function only touches `document` and its own arguments: it is handed to
// casper.evaluate as an argument so it can be used within the webpage context.
function querySelectorGet(selector, property, defaultValue) {
	var element = document.querySelector(selector);
	if (!element) {
		return defaultValue;
	}
	var value = element[property];
	// Loose `== null` deliberately matches both null and undefined; '' and 0 are kept as-is.
	return value == null ? defaultValue : value;
}

// Collects the raw href attribute of every anchor found inside a table of the
// current document. Returns an array with one entry per anchor, in document
// order; an entry is whatever getAttribute('href') yields for that anchor.
function getLinks() {
	var anchors = document.querySelectorAll('table a');
	var hrefs = [];
	for (var i = 0; i < anchors.length; i++) {
		hrefs.push(anchors[i].getAttribute('href'));
	}
	return hrefs;
}

// Scrapes one MP profile page into a plain object.
//
// querySelectorGet - function(selector, property, defaultValue) used for every
//                    single-element lookup; it is received as a parameter
//                    instead of being read from an outer scope.
//
// Returns an object (not a JSON string) with the keys name, title, email,
// party, dateJoined, phone, constituency, address1, address2 and socialMedia.
// Every text field is the trimmed innerHTML of its element, or 'N/A' when
// querySelectorGet falls back to the default.
function scrapLordDetails(querySelectorGet) {
	// Trimmed innerHTML of the first match for `selector`, 'N/A' as fallback.
	// Routing every field through this helper keeps the trimming uniform
	// (the phone field used to be the only one returned untrimmed).
	function readText(selector) {
		return querySelectorGet(selector, 'innerHTML', 'N/A').trim();
	}

	// Get the first 'a' tag that has a 'mailto' href value
	var email = readText("a[href^='mailto']");

	// The rest of the information can be extracted via ID tags
	var title = readText('div#commons-biography-header h1');
	var name = readText('div#commons-addressas');
	var party = readText('div#commons-party');
	// NOTE(review): this is the Lords selector, unlike the other commons-* ids here — confirm it exists on MP pages
	var dateJoined = readText('div#joined-lords');
	var phone = readText('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone');
	var constituency = readText('div#commons-constituency');
	var address1 = readText('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlAddress');
	// NOTE(review): same selector as `phone`, so address2 duplicates the phone value —
	// looks like a copy-paste slip; confirm the intended address element id
	var address2 = readText('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone');

	// There are possibly multiple social media links, so collect every href
	var socialMediaATags = document.querySelectorAll('#social-media a');
	var socialMedia = Array.prototype.map.call(socialMediaATags, function(e) {
		return e.getAttribute('href');
	});

	// Key order is kept stable because it is the order the fields are serialised in
	return {
		name: name,
		title: title,
		email: email,
		party: party,
		dateJoined: dateJoined,
		phone: phone,
		constituency: constituency,
		address1: address1,
		address2: address2,
		// There is no fixed number of social media links, so they are joined with ' ; '
		socialMedia: socialMedia.join(' ; ')
	};
}


// ==\ Some helper functions \== //

// == Let the scraping begin == //


// Step 1: open the MPs index page and report its title.
casper.start('http://www.parliament.uk/mps-lords-and-offices/mps/', function() {
	var pageTitle = this.getTitle();
	this.echo('Opened main site titled: ' + pageTitle);
});

// Step 2: gather the profile links from the index page, then queue one visit per link.
casper.then(function() {
	links = this.evaluate(getLinks);
	this.echo('scraping links ...');

	casper.eachThen(links, function(response) {
		// response.data holds the entry of `links` this step was queued for
		var profileUrl = response.data;

		casper.open(profileUrl).then(function() {
			// querySelectorGet is handed over as an argument so that
			// scrapLordDetails can use it within the webpage context
			var row = this.evaluate(scrapLordDetails, querySelectorGet);
			// push() returns the new length, which doubles as the progress counter
			var scrapedCount = scrapedRows.push(row);

			this.echo('Scraped row ' + scrapedCount + ' of ' + links.length);
		});
	});
});

// Step 3: dump every scraped row into a single JSON file
// (it can be converted to CSV with http://konklone.io/json/).
// NOTE(review): the MP data is written to 'lords.json', the same file name the
// House of Lords script uses — confirm this is intended before running both in one directory.
casper.then(function() {
	var serialisedRows = JSON.stringify(scrapedRows);
	fs.write('lords.json', serialisedRows, 'w');
});

// Run the queued steps and quit once they have completed.
casper.run(function() {
	casper.exit();
});
	// == This script extracts emails and other data about members of the UK House of Lords from the following URL:
	// http://www.parliament.uk/mps-lords-and-offices/lords/

	var casper = require('casper').create();
	var fs = require('fs');

	// This array maintains a list of links to each HOL profile
	var links = [];

	// This array maintains the scraped information and is saved by the end of this script
	var scrapedRows = [];

	// == Some helper functions == //


	// Reads one property from the first element matching a CSS selector.
	//
	// selector     - CSS selector handed to document.querySelector.
	// property     - name of the property to read from the matched element (e.g. 'innerHTML').
	// defaultValue - returned when no element matches, or when the matched element's
	//                property is null/undefined.
	//
	// Callers chain .trim() onto the result, so a null/undefined property value must not leak out.
	// The function only touches `document` and its own arguments: it is handed to
	// casper.evaluate as an argument so it can be used within the webpage context.
	function querySelectorGet(selector, property, defaultValue) {
		var element = document.querySelector(selector);
		if (!element) {
			return defaultValue;
		}
		var value = element[property];
		// Loose `== null` deliberately matches both null and undefined; '' and 0 are kept as-is.
		return value == null ? defaultValue : value;
	}

	// Collects the raw href attribute of every anchor found inside a table of the
	// current document. Returns an array with one entry per anchor, in document
	// order; an entry is whatever getAttribute('href') yields for that anchor.
	function getLinks() {
		var anchors = document.querySelectorAll('table a');
		var hrefs = [];
		for (var i = 0; i < anchors.length; i++) {
			hrefs.push(anchors[i].getAttribute('href'));
		}
		return hrefs;
	}

	// Scrapes one House of Lords profile page into a plain object.
	//
	// querySelectorGet - function(selector, property, defaultValue) used for every
	//                    single-element lookup; it is received as a parameter
	//                    instead of being read from an outer scope.
	//
	// Returns an object with the keys name, title, email, party, dateJoined, phone,
	// address1, address2 and socialMedia. Each text field is the trimmed innerHTML
	// of its element, or 'N/A' when querySelectorGet falls back to the default.
	function scrapLordDetails(querySelectorGet) {
		// Trimmed innerHTML of the first match for `selector`, 'N/A' as fallback.
		function readText(selector) {
			return querySelectorGet(selector, 'innerHTML', 'N/A').trim();
		}

		// The email is taken from the first 'a' tag whose href starts with 'mailto'
		var emailText = readText("a[href^='mailto']");

		// Everything else is looked up through ID selectors
		var titleText = readText('div#lords-fulltitle');
		var nameText = readText('div#lords-name');
		var partyText = readText('div#lords-party-group');
		var joinedText = readText('div#joined-lords');
		var phoneText = readText('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone');
		var firstAddress = readText('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl00_pnlAddress');
		var secondAddress = readText('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl01_pnlAddress');

		// A profile can carry any number of social media links; gather every href
		var socialAnchors = document.querySelectorAll('#web-social-media a');
		var socialHrefs = [];
		for (var i = 0; i < socialAnchors.length; i++) {
			socialHrefs.push(socialAnchors[i].getAttribute('href'));
		}

		// Key order is kept stable because it is the order the fields are serialised in
		return {
			name: nameText,
			title: titleText,
			email: emailText,
			party: partyText,
			dateJoined: joinedText,
			phone: phoneText,
			address1: firstAddress,
			address2: secondAddress,
			// Variable number of links, so they are flattened into one ' ; '-separated string
			socialMedia: socialHrefs.join(' ; ')
		};
	}


	// ==\ Some helper functions \== //

	// == Let the scraping begin == //


	// Step 1: open the House of Lords index page and report its title.
	casper.start('http://www.parliament.uk/mps-lords-and-offices/lords/', function() {
		var pageTitle = this.getTitle();
		this.echo('Opened main site titled: ' + pageTitle);
	});

	// Step 2: gather the profile links from the index page, then queue one visit per link.
	casper.then(function() {
		links = this.evaluate(getLinks);
		this.echo('scraping links ...');

		casper.eachThen(links, function(response) {
			// response.data holds the entry of `links` this step was queued for
			var profileUrl = response.data;

			casper.open(profileUrl).then(function() {
				// querySelectorGet is handed over as an argument so that
				// scrapLordDetails can use it within the webpage context
				var row = this.evaluate(scrapLordDetails, querySelectorGet);
				// push() returns the new length, which doubles as the progress counter
				var scrapedCount = scrapedRows.push(row);

				this.echo('Scraped row ' + scrapedCount + ' of ' + links.length);
			});
		});
	});

	// Step 3: dump every scraped row into a single JSON file
	// (it can be converted to CSV with http://konklone.io/json/).
	casper.then(function() {
		var serialisedRows = JSON.stringify(scrapedRows);
		fs.write('lords.json', serialisedRows, 'w');
	});

	// Run the queued steps and quit once they have completed.
	casper.run(function() {
		casper.exit();
	});