Skip to content

Instantly share code, notes, and snippets.

@hiiwave
Created March 23, 2018 09:19
Show Gist options
  • Save hiiwave/5db0b66e95fb9fc75e07a12b9fce286c to your computer and use it in GitHub Desktop.
Save hiiwave/5db0b66e95fb9fc75e07a12b9fce286c to your computer and use it in GitHub Desktop.
Sample code using phantomjs-node to scrape a web page after it has fully loaded
var phantom = require('phantom');
var fs = require('fs');
/**
 * Page scraper built on phantomjs-node.
 *
 * Opens a URL in a PhantomJS instance, polls `checkLoaded` until the page is
 * considered ready, and returns the page's HTML. Callers customise readiness
 * detection by replacing `pageScraper.checkLoaded` with their own function.
 */
let pageScraper = (function () {
  'use strict';

  const scraper = {
    /** HTML of the most recently scraped page; null until a scrape succeeds. */
    content: null,

    /**
     * Resource filter registered with `page.on('onResourceRequested', true, ...)`.
     *
     * Runs in the PhantomJS runtime (https://github.com/amir20/phantomjs-node#pageon)
     * and has no access to this closure, so everything it needs is declared
     * inside it. NOTE(review): it is kept as a plain ES5 function expression
     * on purpose, presumably because it is shipped to PhantomJS as source text.
     *
     * @param {{url: string}} requestData - description of the outgoing request.
     * @param {{abort: Function}} request - handle used to cancel the request.
     */
    onResourceRequested: function (requestData, request) {
      var verbose = true;
      var ignoreReg = [/.*\.css/, /.*\.png/];
      var ignore = ignoreReg.some(function (reg) {
        return reg.test(requestData.url);
      });
      if (ignore) {
        // Abort exactly once, even if several patterns match the same URL.
        request.abort();
        return;
      }
      if (verbose) {
        console.log('Requesting', requestData.url);
      }
    },

    /**
     * Default readiness probe: reports "loaded" after a fixed 200 ms delay
     * without inspecting the page. Meant to be overridden.
     *
     * @param {object} page - phantomjs-node page (unused by the default probe).
     * @returns {Promise<void>} resolves when the page counts as loaded;
     *   an override should reject to request another attempt.
     */
    checkLoaded: function (page) {
      return new Promise((resolve) => {
        setTimeout(resolve, 200);
      });
    },

    /**
     * Polls `checkLoaded(page)` until it resolves.
     *
     * The budget is counted in attempts, not wall-clock time: polling gives up
     * once `timeout / period` attempts have failed. Time spent inside
     * `checkLoaded` itself is not counted.
     *
     * @param {object} page - phantomjs-node page handed to `checkLoaded`.
     * @param {number} [timeout=30000] - polling budget in milliseconds.
     * @param {number} [period=250] - delay between attempts in milliseconds.
     * @returns {Promise<void>} resolves once `checkLoaded` resolves; rejects
     *   with the string "waitPageLoaded: Time limit exceed" when the budget
     *   is exhausted.
     */
    waitPageLoaded: function (page, timeout = 30000, period = 250) {
      const maxAttempts = timeout / period;
      return new Promise((resolve, reject) => {
        let tried = 0;
        const attempt = function () {
          tried++;
          console.log("Try: " + tried);
          // Calling checkLoaded inside an executor turns a synchronous throw
          // into a rejection, so it is retried instead of escaping from a
          // timer callback as an uncaught exception.
          new Promise((settle) => {
            settle(scraper.checkLoaded(page));
          }).then(
            () => {
              resolve();
            },
            () => {
              if (tried >= maxAttempts) {
                reject("waitPageLoaded: Time limit exceed");
              } else {
                setTimeout(attempt, period);
              }
            }
          );
        };
        attempt();
      });
    },

    /**
     * Loads `url` in a fresh PhantomJS instance and returns its HTML once
     * `waitPageLoaded` reports the page as ready. The HTML is also stored in
     * `content`. The PhantomJS instance is shut down on every outcome.
     *
     * @param {string} url - address of the page to scrape.
     * @returns {Promise<string>} resolves with the page HTML; rejects with
     *   "status: <status>" when the page fails to open, or with whatever
     *   error the underlying phantom/page call produced.
     */
    scrape: function (url) {
      let phInstance = null;

      // Exit PhantomJS if it was started. A failing exit is only logged so it
      // cannot mask the real result of the scrape.
      const shutdown = function () {
        if (phInstance === null) {
          return;
        }
        const instance = phInstance;
        phInstance = null;
        new Promise((settle) => {
          settle(instance.exit());
        }).catch((error) => {
          console.log("Failed to exit phantom instance: " + error);
        });
      };

      return Promise.resolve()
        .then(() => phantom.create())
        .then((instance) => {
          phInstance = instance;
          return instance.createPage();
        })
        .then((page) => {
          // The handler runs in the PhantomJS runtime:
          // https://github.com/amir20/phantomjs-node#pageon
          const registered = page.on('onResourceRequested', true, scraper.onResourceRequested);
          return Promise.resolve(registered)
            .then(() => {
              console.log("Start loading page..");
              return page.open(url);
            })
            .then((status) => {
              if (status !== "success") {
                // Rejection values stay plain strings for existing callers.
                return Promise.reject("status: " + status);
              }
              return scraper.waitPageLoaded(page);
            })
            .then(() => page.property('content'));
        })
        .then(
          (content) => {
            scraper.content = content;
            shutdown();
            return content;
          },
          (error) => {
            shutdown();
            throw error;
          }
        );
    }
  };

  return scraper;
})();
// let url = "http://example.org/";
// Page to scrape; its content is filled in by scripts after the initial load.
const url = "https://treeofsavior.com/page/class/ranking.php";

/**
 * Readiness probe for the ranking page: the page counts as loaded once the
 * element with id `classbuild_date` has non-blank inner HTML.
 *
 * Every failure mode (element not present yet, non-string result, evaluate
 * error) ends in a rejection, so `waitPageLoaded` keeps retrying and can
 * eventually time out instead of waiting forever on a promise that never
 * settles.
 *
 * @param {object} page - phantomjs-node page being scraped.
 * @returns {Promise<void>} resolves when loaded, rejects otherwise.
 */
pageScraper.checkLoaded = function (page) {
  return page.evaluate(function () {
    // Evaluated inside the page by PhantomJS: plain ES5, page globals only.
    var el = document.getElementById('classbuild_date');
    return el ? el.innerHTML : '';
  }).then(function (html) {
    const text = typeof html === 'string' ? html.trim() : '';
    console.info("innerhtml: " + text);
    if (text === "") {
      return Promise.reject();
    }
    return undefined;
  });
};

// Scrape the page and save the rendered HTML next to the script.
pageScraper.scrape(url)
  .then((content) => {
    fs.writeFileSync('output.html', content);
    console.log("File written to output.html");
  })
  .catch((error) => {
    console.log("ScrapeError: " + error);
  });

// Printed first: the scrape above runs asynchronously.
console.log("hello world");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment