// Page that will be opened and scrolled through.
var target = "https://thenounproject.com/";

// Scroll bookkeeping shared with getContent():
//   scrolled    - the most recently measured vertical scroll offset
//   scrollDelta - difference between the last two measurements (null until the first one)
var scrolled = 0;
var scrollDelta = null;

// Create the CasperJS instance with verbose output at the "info" log level.
var casper = require('casper').create({
    logLevel: "info",
    verbose: true,
    pageSettings: {
        // Identify as desktop Chrome on a Mac rather than the default headless agent.
        userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11",
        // Web security is switched off; see the CasperJS FAQ entry on download problems:
        // http://casperjs.readthedocs.org/en/latest/faq.html#i-m-having-hard-times-downloading-files-using-download
        webSecurityEnabled: false
    }
});

// Register the initial navigation to the target page.
casper.start(target);

var getContent = function() { |
|
casper.wait(1000, function() { // Wait 1s and then (http://casperjs.readthedocs.org/en/latest/modules/casper.html#wait) |
|
casper.scrollToBottom(); // scroll to the bottom (http://casperjs.readthedocs.org/en/latest/modules/casper.html#scrolltobottom) |
|
var newScrolled = casper.evaluate(function() { |
|
return window.scrollY; // grab how far the window is scrolled (https://developer.mozilla.org/en-US/docs/Web/API/Window/scrollY) |
|
}); |
|
scrollDelta = newScrolled - scrolled; // update scrollDelta |
|
scrolled = newScrolled; // and scrolled |
|
console.log("Now scrolled", scrolled); |
|
}); |
|
casper.then(function() { // After we scroll to the bottom (http://casperjs.readthedocs.org/en/latest/modules/casper.html#then) |
|
if (scrollDelta != 0) { // Check whether scrollDelta is zero, which means that we haven't scrolled any further |
|
getContent(); // If scrollDelta _has_ changed, recursively call getContent |
|
} else { |
|
casper.then(function() { // Otherwise |
|
console.log("Saving…"); |
|
var html = String(casper.getHTML()); // grab our HTML (http://casperjs.readthedocs.org/en/latest/modules/casper.html#gethtml) |
|
var filename = target.replace(/[^A-z]/g, ''); // create a sanitized filename by removing all the non A-Z characters (https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions) |
|
require('fs').write(filename + ".html", html, 'w'); // and save it to a file (https://docs.nodejitsu.com/articles/file-system/how-to-write-files-in-nodejs) |
|
console.log("…wrote HTML to", filename); |
|
}); |
|
} |
|
}); |
|
}; |
|
|
|
// Queue the first scroll-and-check cycle; getContent() queues any further cycles itself.
getContent();

// Run the queued steps.
casper.run();
Thank you so much. I had to add
getContent();
inside the "Otherwise" branch so that it keeps running indefinitely, but it reaches a certain point and then stops scrolling (the "Now scrolled" value has stayed the same for 40 minutes now). I am confident that the page I'm scraping uses infinite scroll: it has far too much content to be a fixed-length page. How do I solve that?