Created
March 23, 2018 09:19
-
-
Save hiiwave/5db0b66e95fb9fc75e07a12b9fce286c to your computer and use it in GitHub Desktop.
Sample code that uses phantomjs-node to scrape a web page once it has fully loaded
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var phantom = require('phantom'); | |
var fs = require('fs'); | |
/**
 * pageScraper: loads a URL in PhantomJS (through the `phantom` module), waits
 * until `checkLoaded` reports that the page is ready, and returns the page HTML.
 *
 * Members:
 *   content             - HTML returned by the last successful `scrape`, or null.
 *   onResourceRequested - request filter handed to `page.on`.
 *   checkLoaded(page)   - readiness probe; meant to be overridden by the caller.
 *   waitPageLoaded(...) - polls `checkLoaded` until it resolves or time runs out.
 *   scrape(url)         - full open / wait / read-content / shut-down cycle.
 */
const pageScraper = (function () {
  'use strict';

  /**
   * Resolve (with no value) after `ms` milliseconds.
   */
  const delay = function (ms) {
    return new Promise(resolve => {
      setTimeout(resolve, ms);
    });
  };

  const scraper = {
    content: null,

    /**
     * Abort requests for stylesheets and PNG images, log every other request.
     *
     * Runs in the phantomjs runtime
     * (https://github.com/amir20/phantomjs-node#pageon) and has no access to
     * this closure, so it must stay a plain `function` expression whose body
     * only uses ES5 syntax and its own locals.
     *
     * @param {{url: string}} requestData - description of the outgoing request.
     * @param {{abort: function}} request - handle used to cancel the request.
     */
    onResourceRequested: function (requestData, request) {
      var verbose = true;
      var ignoreReg = [/.*\.css/, /.*\.png/];
      var ignore = ignoreReg.some(function (reg) {
        return reg.test(requestData.url);
      });
      if (ignore) {
        request.abort();
        return;
      }
      if (verbose) {
        console.log('Requesting', requestData.url);
      }
    },

    /**
     * Default readiness probe: considers the page loaded after 200 ms.
     * Callers replace this with a page-specific check that resolves when the
     * page is ready and rejects when it is not ready yet.
     *
     * @param {object} page - page object (unused by the default probe).
     * @returns {Promise<void>}
     */
    checkLoaded: function (page) {
      return delay(200);
    },

    /**
     * Poll `checkLoaded` until it resolves.
     *
     * The time budget is measured with the clock, so slow probes count
     * against `timeout` instead of stretching the total wait.
     *
     * @param {object} page - page object passed through to `checkLoaded`.
     * @param {number} [timeout=30000] - total time budget in milliseconds.
     * @param {number} [period=250] - pause between two probes in milliseconds.
     * @returns {Promise<void>} resolves once the probe succeeds; rejects with
     *   the string "waitPageLoaded: Time limit exceed" when the budget is used up.
     */
    waitPageLoaded: function (page, timeout = 30000, period = 250) {
      const startedAt = Date.now();
      let tried = 0;

      const attempt = function () {
        tried++;
        console.log("Try: " + tried);
        // Promise.resolve().then(...) also converts a synchronous throw from
        // an overridden probe into a normal "not ready yet" rejection.
        return Promise.resolve()
          .then(() => scraper.checkLoaded(page))
          .then(
            () => undefined,
            () => {
              if (Date.now() - startedAt >= timeout) {
                // Rejection reason stays a string, as callers have always received.
                return Promise.reject("waitPageLoaded: Time limit exceed");
              }
              return delay(period).then(attempt);
            }
          );
      };

      return attempt();
    },

    /**
     * Open `url` in a fresh PhantomJS instance, wait for the page to be ready
     * and return its HTML. The PhantomJS instance is shut down on every path
     * (success, failed status, timeout, or any unexpected error).
     *
     * @param {string} url - address of the page to scrape.
     * @returns {Promise<string>} resolves with the page content (also stored in
     *   `content`); rejects with "status: <status>" when the page fails to
     *   open, with the `waitPageLoaded` reason on timeout, or with the
     *   underlying error otherwise.
     */
    scrape: function (url) {
      let phInstance = null;

      // Stop PhantomJS at most once. A failing shutdown is logged but never
      // replaces the real result or error of the scrape.
      const shutdown = function () {
        if (phInstance === null) {
          return;
        }
        const instance = phInstance;
        phInstance = null;
        Promise.resolve()
          .then(() => instance.exit())
          .catch(exitError => {
            console.log("Failed to stop PhantomJS: " + exitError);
          });
      };

      return Promise.resolve()
        .then(() => phantom.create())
        .then(instance => {
          phInstance = instance;
          return instance.createPage();
        })
        .then(page => {
          // `true` asks for the callback to run in the phantomjs runtime:
          // https://github.com/amir20/phantomjs-node#pageon
          const registered = page.on('onResourceRequested', true, scraper.onResourceRequested);
          return Promise.resolve(registered)
            .then(() => {
              console.log("Start loading page..");
              return page.open(url);
            })
            .then(status => {
              if (status !== "success") {
                // Rejection reason stays a string, as callers have always received.
                return Promise.reject("status: " + status);
              }
              return scraper.waitPageLoaded(page);
            })
            .then(() => page.property('content'));
        })
        .then(
          content => {
            scraper.content = content;
            shutdown();
            return content;
          },
          error => {
            shutdown();
            throw error;
          }
        );
    }
  };

  return scraper;
})();
// Page to scrape: its 'classbuild_date' element is filled in after the
// initial load, so the default readiness probe is replaced below.
const url = "https://treeofsavior.com/page/class/ranking.php";

/**
 * Page-specific readiness probe: the page counts as loaded once the
 * 'classbuild_date' element exists and has non-blank content.
 *
 * The promise chain from `page.evaluate` is returned directly, so a missing
 * element, an empty value, or a failing evaluate call all end up as a
 * rejection ("not ready yet") instead of a promise that never settles.
 *
 * @param {object} page - page object that provides `evaluate`.
 * @returns {Promise<void>} resolves when the element has content, rejects otherwise.
 */
pageScraper.checkLoaded = function (page) {
  return page.evaluate(function () {
    // This function reads `document`, i.e. it is executed against the loaded
    // page rather than in Node; keep it ES5-only and free of outer references.
    var dateElement = document.getElementById('classbuild_date');
    return dateElement ? dateElement.innerHTML : null;
  }).then(function (html) {
    // `html` is null when the element is absent; treat that like empty content.
    const text = (html == null ? "" : String(html)).trim();
    console.info("innerhtml: " + text);
    if (text === "") {
      throw new Error("classbuild_date is not populated yet");
    }
  });
};

pageScraper.scrape(url)
  .then(content => {
    fs.writeFileSync('output.html', content);
    console.log("File written to output.html");
  })
  .catch(error => {
    console.log("ScrapeError: " + error);
  });

// Printed immediately: the scrape above continues asynchronously.
console.log("hello world");
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment