Skip to content

Instantly share code, notes, and snippets.

@Macxim
Created November 6, 2017 13:51
Show Gist options
  • Save Macxim/b3cf74d8fced60d8f26a09a4f6b37213 to your computer and use it in GitHub Desktop.
Save Macxim/b3cf74d8fced60d8f26a09a4f6b37213 to your computer and use it in GitHub Desktop.
Web scraping a paginated web page with Nightmare.js
/**
* Scrape data from https://london.wtm.com/en/exhibitor-directory-2017/2017-Products
*
* Usage: $ node index.js
*/
var Nightmare = require('nightmare');
var vo = require('vo');
var fs = require('fs');
// Entry point: hand the `run` generator to vo and rethrow any error it
// reports so that a failed scrape is not silently ignored.
vo(run)((error, _result) => {
  if (error) {
    throw error;
  }
});
/**
 * Recursively flatten nested arrays into a single flat array.
 *
 * Example: [ [ a ], [ b ], [ c ] ] => [ a, b, c ]
 *
 * @param {Array} arr - Possibly nested array of values.
 * @returns {Array} New array holding every non-array element, in order.
 */
function flatten(arr) {
  const flat = [];

  // Depth-first walk; every non-array value is appended in encounter order.
  const collect = (items) => {
    for (let idx = 0; idx < items.length; idx += 1) {
      const item = items[idx];
      if (Array.isArray(item)) {
        collect(item);
      } else {
        flat.push(item);
      }
    }
  };

  collect(arr);
  return flat;
}
/**
 * Write the scraped rows to `./missing-stuff.csv` (CRLF line endings).
 *
 * Every cell is quoted with `enquote`, so values containing commas, double
 * quotes or line breaks (e.g. a company called `Acme, Ltd`) stay in their own
 * column instead of shifting the rest of the row.
 *
 * @param {Array} linksArr - One entry per exhibitor. Each entry is an array of
 *   cells; cells wrapped in a single-element array (`[[url], [company], ...]`)
 *   are unwrapped one level. A non-array entry is written as a one-cell row.
 * @throws {Error} Whatever `fs.writeFileSync` throws if the file cannot be
 *   written (it is synchronous and takes no callback).
 */
function outputToCsv(linksArr) {
  var header = 'URL, Company, Website, Facebook, Phone';
  var lines = [header];

  for (var i = 0; i < linksArr.length; i++) {
    var row = linksArr[i];
    // Unwrap one nesting level so both [[a], [b]] and [a, b] yield cells a, b.
    var cells = Array.isArray(row) ? [].concat(...row) : [row];
    var quoted = cells.map(function (cell) {
      return enquote(cell);
    });
    lines.push(quoted.join(','));
  }

  fs.writeFileSync('./missing-stuff.csv', lines.join('\r\n'));
}

/**
 * Quote a value as a CSV field.
 *
 * @param {*} [val] - Value to quote; `null`, `undefined` or a missing
 *   argument produce an empty quoted field.
 * @returns {string} The value's string form wrapped in double quotes, with
 *   embedded double quotes doubled.
 */
function enquote(val) {
  if (val == null) {
    return '""';
  }
  return '"' + String(val).replace(/"/g, '""') + '"';
}
/**
 * Scrape the exhibitor directory and save the results as CSV.
 *
 * Steps:
 *   1. Open the directory listing and page through it with the pagination
 *      button (at most MAX_PAGE clicks), collecting exhibitor links.
 *   2. Visit every exhibitor page once and read company name, website,
 *      Facebook page and phone number ("N/A" when an element is missing).
 *   3. Pass one row per exhibitor to `outputToCsv`.
 *
 * This is a generator meant to be driven by `vo`; every `yield` hands a
 * Nightmare action chain to the runner and resumes with its result. The
 * Nightmare instance is always ended, even when a step throws.
 */
function* run() {
  const MAX_PAGE = 300;
  const NEXT_BUTTON = '.pagination .gButton a';
  const nightmare = Nightmare();

  try {
    // Let's get all links from that address below
    yield nightmare
      .goto('http://london.wtm.com/en/exhibitor-directory-2017/?rpp=12&d=103087|152_214625')
      .wait(3000);

    const pages = [];
    let currentPage = 0;
    // Keep paging for as long as the pagination button is visible.
    let nextExists = yield nightmare.visible(NEXT_BUTTON);
    while (nextExists && currentPage < MAX_PAGE) {
      pages.push(yield* collectPageLinks(nightmare));
      yield nightmare
        .click(NEXT_BUTTON)
        .wait(5000);
      currentPage++;
      nextExists = yield nightmare.visible(NEXT_BUTTON);
    }
    // The page we stopped on (the last one, or the only one) has not been
    // collected by the loop yet.
    pages.push(yield* collectPageLinks(nightmare));

    // We want a clean array of links
    const links = flatten(pages);

    if (links.length === 0) {
      console.log('⚠️ Sorry. No links found. Exiting.');
      return;
    }

    console.log('ℹ️ ' + links.length + ' links found. \r');
    console.log('🔥 Let\'s do this! Please be aware that this could take a while... Be patient. \n');
    console.log('📕 Retrieving companies details...\r');

    // One visit per exhibitor page; all four fields are read in that visit.
    const rows = [];
    for (let i = 0; i < links.length; i++) {
      const details = yield* scrapeExhibitor(nightmare, links[i]);
      rows.push([
        links[i],
        details.company,
        details.website,
        details.facebook,
        details.phone,
      ]);
    }

    console.log('📝 Saving results in .csv file...\n');
    outputToCsv(rows);
    console.log('🤑 Go make some money now!');
  } finally {
    // Always end the Nightmare instance so a failed run does not leave it open.
    yield nightmare.end();
  }
}

/**
 * Collect the exhibitor links listed on the currently loaded results page.
 *
 * @param {Object} nightmare - Nightmare instance showing a results page.
 * @returns {string[]} (generator return value) Absolute exhibitor URLs built
 *   by prefixing each link's `href` attribute with the site origin.
 */
function* collectPageLinks(nightmare) {
  // The callback runs inside the page, so it cannot use outer variables.
  return yield nightmare.evaluate(function () {
    var anchors = document.querySelectorAll('.resultItem.exhibitor h3 a');
    return Array.prototype.map.call(anchors, function (anchor) {
      return 'https://london.wtm.com' + anchor.getAttribute('href');
    });
  });
}

/**
 * Load one exhibitor page and read its contact details.
 *
 * @param {Object} nightmare - Nightmare instance used for navigation.
 * @param {string} link - Exhibitor page URL.
 * @returns {{company: string, website: string, facebook: string, phone: string}}
 *   (generator return value) Each field is "N/A" when its element is absent.
 */
function* scrapeExhibitor(nightmare, link) {
  return yield nightmare
    .goto(link)
    .evaluate(function () {
      // Runs inside the page: read `property` of the first match, or "N/A".
      function read(selector, property) {
        var node = document.querySelector(selector);
        return node ? node[property] : 'N/A';
      }
      return {
        company: read('h2.exhibitorName', 'innerText'),
        website: read('.socialNetworkProfiles .link a', 'href'),
        facebook: read('.socialNetworkProfiles .facebook a', 'href'),
        phone: read('.vcard .tel .value', 'innerText')
      };
    });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment