Created
May 10, 2013 19:25
-
-
Save MarkR42/5556769 to your computer and use it in GitHub Desktop.
This *did* crash phantomjs 1.9.0, but no longer does because the site rednekked.com has fixed itself.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
http://www.rednekked.com/2013/04/30/planes-collide-near-la-1-crashes-1-lands-1-dead-news-12-long-island/
http://www.richjohnston.com
http://www.robertoaugustoblog.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env phantomjs
// PhantomJS file-system module; main() uses it to read the URL list.
var fs = require('fs');
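/*
 * Fetches every URL listed in the file named by the first command-line
 * argument, one page at a time, denying requests to blacklisted hosts.
 */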
/*
 * These are hosts which don't provide interesting content.
 * We block access to them.
 *
 * Each entry is a host-name suffix: allow_url() denies a request when the
 * (lower-cased) host of its URL ends with one of these strings.
 */
var HOST_BLACKLIST = [
    '.googletagservices.com',
    '.googleadservices.com',
    '.doubleclick.net',
    '.mediaplex.com',
    '.apmebf.com',
    '.googlesyndication.com',
    '.google-analytics.com',
    '.atdmt.com',
    '.adnxs.com',
    'rover.ebay.com',
    '.atwola.com',
    // Facebook:
    '.fbcdn.net', '.facebook.com', '.facebook.net',
    '.twitter.com', '.twimg.com',
    '.addthis.com', // some social net
    // More ads
    '.globo.com', '.glbimg.com'
];
/**
 * Report whether `str` ends with `suffix`.
 *
 * @param {string} str     String to inspect.
 * @param {string} suffix  Ending to look for; the empty string always matches.
 * @returns {boolean} true when the last `suffix.length` characters of
 *                    `str` are exactly `suffix`.
 */
function str_ends_with(str, suffix)
{
    var start = str.length - suffix.length;
    // A suffix longer than the string can never match.
    if (start < 0) {
        return false;
    }
    return str.substring(start) === suffix;
}
/**
 * Decide whether a request to `url` may proceed.
 *
 * The host is taken from the text after the first "://", cut at the first
 * '/', '?' or '#', with any "user:pass@" prefix and ":port" suffix removed,
 * then lower-cased. The request is denied when that host ends with an entry
 * of HOST_BLACKLIST; an entry written with a leading dot (".example.com")
 * also denies the bare domain ("example.com").
 *
 * @param {string} url  URL of the resource being requested.
 * @returns {boolean} false when the host is blacklisted; true otherwise,
 *                    including for URLs that contain no "://".
 */
function allow_url(url)
{
    var schemeEnd = url.indexOf('://');
    if (schemeEnd === -1) {
        // No host part to inspect. By default allow.
        return true;
    }

    // Authority section: everything up to the path, query or fragment.
    var authority = url.substring(schemeEnd + 3).split(/[\/?#]/)[0];
    // Drop "user:pass@" if present (lastIndexOf gives -1 + 1 = 0 otherwise).
    var host = authority.substring(authority.lastIndexOf('@') + 1);
    // Drop a trailing ":port" so "ads.example.com:8080" is still recognised.
    host = host.replace(/:\d*$/, '').toLowerCase();

    // Prefixing a dot lets ".example.com" match the bare "example.com" too,
    // while every host that matched without the dot still matches.
    var dottedHost = '.' + host;
    var i;
    for (i = 0; i < HOST_BLACKLIST.length; i++) {
        if (str_ends_with(dottedHost, HOST_BLACKLIST[i])) {
            // Blacklisted.
            return false;
        }
    }
    // By default allow.
    return true;
}
/**
 * Strip every <script>, <noscript> and <noframes> element from the document
 * currently loaded in `page`. <style> elements are left in place.
 *
 * @param {object} page  PhantomJS WebPage whose DOM is modified in place.
 */
function remove_unwanted_elements(page)
{
    // Everything the callback needs is declared inside it: it is handed to
    // page.evaluate() to run against the page's own `document`.
    page.evaluate(function () {
        var unwantedTags = ['script', 'noscript', 'noframes'];
        var t, elems, i, node;
        for (t = 0; t < unwantedTags.length; t++) {
            elems = document.getElementsByTagName(unwantedTags[t]);
            // Walk backwards so removals never shift an index that is
            // still to be visited.
            for (i = elems.length - 1; i >= 0; i--) {
                node = elems[i];
                // Remove node from the dom, if it's in.
                if (node.parentNode) {
                    node.parentNode.removeChild(node);
                }
            }
        }
    });
}
/**
 * Open `url` in a fresh PhantomJS page and call `finished_function` once the
 * load has either completed or failed.
 *
 * While loading, any request that is not a GET, or whose host is rejected by
 * allow_url(), is aborted. Image loading is disabled. If the load has not
 * finished after 15 seconds, page.stop() is called. After a successful load
 * the title is logged, script/noscript/noframes elements are removed and the
 * document's text content is read.
 *
 * @param {string}   url                Address to fetch.
 * @param {function} finished_function  Called with no arguments, exactly once,
 *                                      when this page is done (success or failure).
 */
function start_fetch_page(url, finished_function) {
    console.log("Starting up");
    var page = require('webpage').create();
    page.settings.javascriptEnabled = true;
    page.settings.loadImages = false;
    page.timerid = 0; // Replaced by the real timer id once the stop timer is armed.
    page.numrequests = 0;
    page.noisy = false; // Page console/error output stays muted until the load finishes.

    var finished = false;

    // Report completion and release the page. Safe to call more than once:
    // only the first call has any effect, so the caller's queue can never be
    // advanced twice for the same page.
    function finish() {
        if (finished) {
            return;
        }
        finished = true;
        finished_function();
        page.close();
    }

    page.onResourceRequested = function(requestData, networkRequest) {
        if (!allow_url(requestData.url) || requestData.method !== 'GET') {
            console.log("Denied: " + requestData.method + " to " + requestData.url);
            networkRequest.abort();
            return;
        }
        // Only requests that were let through are counted.
        page.numrequests += 1;
    };

    page.onUrlChanged = function(targetUrl) {
        console.log('New URL: ' + targetUrl);
    };

    page.onConsoleMessage = function(msg) {
        if (!page.noisy) {
            return;
        }
        console.log('Log from page:' + msg);
    };

    page.onError = function(msg, trace) {
        if (!page.noisy) {
            return;
        }
        console.log("Error inside page:" + msg);
        (trace || []).forEach(function(item) {
            console.log('  ', item.file, ':', item.line);
        });
    };

    function loadedfunc(status) {
        // Ignore any notification that arrives after the page was finished.
        if (finished) {
            return;
        }
        // Cancel stop timer.
        window.clearTimeout(page.timerid);
        // We should pay attention now. Enable error handler.
        page.noisy = true;

        if (status !== "success") {
            console.log("*** FAILED TO LOAD PAGE ***");
            // Stop here: previously execution fell through, evaluated the
            // failed page and invoked finished_function a second time.
            finish();
            return;
        }

        console.log("Loaded with " + page.numrequests + " requests");
        var title = page.evaluate(function() {
            return document.title;
        });
        console.log("Title: " + title);

        // Remove script, noscript and noframes elements.
        remove_unwanted_elements(page);

        // Get the whole page body! The value is not used further at present;
        // log it or call page.render() here when inspecting a page.
        var body = page.evaluate(function() {
            return document.documentElement.textContent;
        });

        finish();
    }

    page.open(url, loadedfunc);

    var stopFunc = function() {
        console.log("Stopping loading");
        page.stop();
    };
    // Give the page 15 seconds to load before forcing it to stop.
    page.timerid = window.setTimeout(stopFunc, 15000);
    console.log("Finished creating request");
}
/**
 * Entry point: read the URL list named by the first command-line argument
 * and fetch the URLs one after another, exiting PhantomJS when none remain.
 *
 * Lines are trimmed and blank lines are skipped. URLs are taken from the end
 * of the list first (urls.pop()).
 */
function main() {
    var listPath = phantom.args[0];
    if (!listPath) {
        console.log("Usage: phantomjs phantomtest.js <url-list-file>");
        phantom.exit(1);
        return;
    }

    // Load urls list...
    var urls = [];
    var f = fs.open(listPath, 'r');
    try {
        // Loop on atEnd() rather than on the line's truthiness, so a blank
        // line in the middle of the file does not end the read early.
        while (!f.atEnd()) {
            var line = f.readLine().trim();
            if (line) {
                urls.push(line);
            }
        }
    } finally {
        f.close();
    }
    console.log("Number of URLs:" + urls.length);

    // Fetch one page; start_fetch_page calls back here when it is done, so
    // only one page is in flight at a time.
    function start_next_page() {
        if (urls.length === 0) {
            console.log("No more URLS in queue");
            phantom.exit();
        } else {
            var url = urls.pop();
            start_fetch_page(url, start_next_page);
        }
    }
    start_next_page();
}
main();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
NB: Run phantomtest.js with crash_list.txt as its first argument. It will attempt to fetch each URL in the list, one at a time, starting from the last line of the file.