Skip to content

Instantly share code, notes, and snippets.

@MarkR42
Created May 10, 2013 19:25
Show Gist options
  • Save MarkR42/5556769 to your computer and use it in GitHub Desktop.
Save MarkR42/5556769 to your computer and use it in GitHub Desktop.
This *did* crash phantomjs 1.9.0, but no longer does because the site rednekked.com has fixed itself.
http://www.rednekked.com/2013/04/30/planes-collide-near-la-1-crashes-1-lands-1-dead-news-12-long-island/
http://www.richjohnston.com
http://www.robertoaugustoblog.com
#!/usr/bin/env phantomjs
fs = require('fs');
/*
*
*
*/
/*
 * Host-name suffixes whose resources are never fetched: advertising,
 * analytics and social-widget hosts that contribute nothing to the
 * content of the page being loaded.
 *
 * An entry with a leading dot is meant to cover every sub-domain of
 * that domain.
 */
var HOST_BLACKLIST = [].concat(
    // Advertising and analytics networks.
    [
        '.googletagservices.com',
        '.googleadservices.com',
        '.doubleclick.net',
        '.mediaplex.com',
        '.apmebf.com',
        '.googlesyndication.com',
        '.google-analytics.com',
        '.atdmt.com',
        '.adnxs.com',
        'rover.ebay.com',
        '.atwola.com'
    ],
    // Social networks and sharing widgets.
    [
        '.fbcdn.net',
        '.facebook.com',
        '.facebook.net',
        '.twitter.com',
        '.twimg.com',
        '.addthis.com'
    ],
    // More ads.
    [
        '.globo.com',
        '.glbimg.com'
    ]
);
/**
 * Report whether `str` finishes with the characters of `suffix`.
 *
 * @param {string} str    The string to inspect.
 * @param {string} suffix The ending to look for; an empty suffix
 *                        always matches.
 * @returns {boolean} true when the tail of `str` equals `suffix`.
 */
function str_ends_with(str, suffix)
{
    // Position at which the suffix would have to begin.
    var tailStart = str.length - suffix.length;
    if (tailStart < 0) {
        // The suffix is longer than the string, so it cannot match.
        return false;
    }
    // Searching from tailStart can only succeed exactly at tailStart,
    // because any later position leaves too few characters.
    return str.indexOf(suffix, tailStart) === tailStart;
}
/**
 * Decide whether a resource URL may be fetched.
 *
 * The host is extracted from the URL and compared against
 * HOST_BLACKLIST. Before comparing, everything that is not part of the
 * host name is removed: the path, query or fragment (whichever comes
 * first), any "user:password@" prefix, an explicit ":port" and a
 * trailing FQDN dot. The comparison is case-insensitive.
 *
 * A blacklist entry with a leading dot (".twitter.com") blocks both the
 * sub-domains ("www.twitter.com") and the bare domain ("twitter.com").
 *
 * @param {string} url Absolute URL of the requested resource.
 * @returns {boolean} false when the host is blacklisted; true otherwise,
 *                    including for URLs without a "scheme://" part.
 */
function allow_url(url)
{
    var schemeSplit = url.split('://');
    if (schemeSplit.length < 2) {
        // No "scheme://host" part (e.g. about:blank): allow by default.
        return true;
    }
    // The authority ends at the first '/', '?' or '#'. Only the piece
    // right after the first "://" is used, so URLs embedded in a query
    // string are ignored.
    var authority = schemeSplit[1].split(/[\/?#]/)[0];
    // Drop any "user:password@" prefix.
    var host = authority.substr(authority.lastIndexOf('@') + 1);
    // Drop an explicit port and a trailing dot, then normalise case.
    host = host.replace(/:\d*$/, '').replace(/\.$/, '').toLowerCase();
    // Prefixing a dot lets ".example.com" match the bare "example.com"
    // as well as its sub-domains, without matching "notexample.com".
    var dottedHost = '.' + host;
    var i;
    for (i = 0; i < HOST_BLACKLIST.length; i++) {
        if (str_ends_with(dottedHost, HOST_BLACKLIST[i])) {
            // Blacklisted.
            return false;
        }
    }
    // By default allow.
    return true;
}
/**
 * Strip every <script>, <noscript> and <noframes> element from the
 * document currently loaded in `page`. <style> elements are left alone.
 *
 * The callback handed to page.evaluate() is kept fully self-contained
 * (it references nothing from this file), since it is meant to run in
 * the page's own context.
 *
 * @param {Object} page Page object exposing evaluate().
 */
function remove_unwanted_elements(page)
{
    page.evaluate(function () {
        var unwantedTags = ['script', 'noscript', 'noframes'];
        unwantedTags.forEach(function (tagName) {
            var matches = document.getElementsByTagName(tagName);
            // Walk from the end towards the start so that removals do
            // not disturb the indices still to be visited.
            var idx = matches.length;
            while (idx-- > 0) {
                var node = matches[idx];
                var parent = node.parentElement;
                // Only nodes still attached to a parent can be removed.
                if (parent) {
                    parent.removeChild(node);
                }
            }
        });
    });
}
function start_fetch_page(url, finished_function) {
console.log("Starting up");
var page = require('webpage').create();
page.settings.javascriptEnabled = true;
page.settings.loadImages = false;
page.timerid = 0; // Will be used later.
page.numrequests = 0;
page.noisy = false;
page.onResourceRequested = function(requestData, networkRequest) {
if (! allow_url(requestData.url) || requestData.method != 'GET') {
console.log("Denied: " + requestData.method + " to " + requestData.url);
networkRequest.abort();
return;
}
page.numrequests += 1;
};
// page.onResourceReceived = function(response) {
// console.log("Received: " + response.url);
// };
page.onUrlChanged = function(targetUrl) {
console.log('New URL: ' + targetUrl);
};
page.onConsoleMessage = function (msg) {
if (! page.noisy) {
return;
}
console.log('Log from page:' + msg);
};
page.onError = function(msg, trace) {
if (! page.noisy) {
return;
}
console.log("Error inside page:" + msg);
trace.forEach(function(item) {
console.log(' ', item.file, ':', item.line);
} );
};
function loadedfunc(status) {
// Cancel stop timer.
window.clearTimeout(page.timerid);
// We should pay attention now. Enable error handler.
page.noisy = true;
if (status !== "success") {
console.log("*** FAILED TO LOAD PAGE ***");
finished_function();
}
console.log("Loaded with " + page.numrequests + " requests");
var title = page.evaluate( function() {
return document.title;
} );
console.log("Title: " + title);
// Remove script and style elements.
// page.evaluate(function() { throw new Error("ERRTEST1"); } );
remove_unwanted_elements(page);
// Get the whole page body!
var body = page.evaluate( function () {
// Remove script and style elements from the page.
return document.documentElement.textContent;
} );
// console.log("Body:");
// console.log(body);
// page.render("page.png"); // Draw it!
finished_function();
page.close();
}
page.open(url, loadedfunc);
var stopFunc = function() {
console.log("Stopping loading");
page.stop();
}
var timerid = window.setTimeout(stopFunc, 15000);
page.timerid = timerid;
console.log("Finished creating request");
}
/**
 * Read a list of URLs (one per line) from the file named by the first
 * command-line argument and fetch them one after another, exiting
 * PhantomJS when the list is exhausted.
 *
 * Blank lines and surrounding whitespace in the list are ignored. URLs
 * are taken from the END of the list first. When the argument is
 * missing or the file cannot be opened, a message is printed and
 * PhantomJS exits with status 1.
 */
function main() {
    if (phantom.args.length === 0) {
        console.log("Usage: phantomjs phantomtest.js <url-list-file>");
        phantom.exit(1);
        return;
    }
    // Load urls list...
    var f;
    try {
        f = fs.open(phantom.args[0], 'r');
    } catch (e) {
        console.log("Cannot open URL list " + phantom.args[0] + ": " + e);
        phantom.exit(1);
        return;
    }
    var urls = [];
    try {
        // Loop on atEnd() rather than on the value of readLine(): a blank
        // line reads as an empty string, which is falsy and would end a
        // value-driven loop before the rest of the file is read.
        while (! f.atEnd()) {
            var line = f.readLine().trim();
            if (line) {
                urls.push(line);
            }
        }
    } finally {
        f.close();
    }
    console.log("Number of URLs:" + urls.length);
    // Fetch one URL at a time; each finished page triggers the next.
    function start_next_page() {
        if (urls.length === 0) {
            console.log("No more URLS in queue");
            phantom.exit();
        } else {
            var url = urls.pop();
            start_fetch_page(url, start_next_page );
        }
    }
    start_next_page();
}
// Script entry point: run main() as soon as the script is loaded.
main();
@MarkR42
Copy link
Author

MarkR42 commented May 10, 2013

NB: Run phantomtest.js with the crash_list.txt as its first argument. It will attempt to fetch each URL in the list, in turn.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment