aresnick/_sync-scrape.md

## _sync-scrape.md

      
    Raw
  

              _sync-scrape.md
            
          
    Using CasperJS to synchronously download a series of dynamic pages

This is a short script to download a list of URLs which may have some dynamic content, requiring a headless browser tool like CasperJS to wait for that content to load before scraping the page and writing the resulting HTML to a file.  You may also find these two other example gists useful.
You'll need to install CasperJS; on a Mac, run `brew install casperjs --devel` (if you don't have Homebrew, you can read how to install it here).
To run the script, use `casperjs sync.js --ssl-protocol=any` from the command line.  Note that, depending on the sites you're scraping, you may also need to use PhantomJS's cookie jar.

  
## sync.js
// A short script to synchronously download a series of URLs

// URLs to fetch, in order. For https targets you will probably need to invoke
// the script with `casperjs --ssl-protocol=any`.
var targets = [
    "https://google.com",
    "https://yahoo.com",
    "https://bing.com"
];

// The single casper instance used by the rest of the script.
var casper = require('casper').create({
    logLevel: "info",
    verbose: true,
    pageSettings: {
        // Present a Chrome-on-Mac user agent string to the sites being fetched
        // (https://msdn.microsoft.com/en-us/library/ms537503(v=vs.85).aspx)
        userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11",
        // Web security is switched off, per the CasperJS FAQ entry on download problems
        // (http://casperjs.readthedocs.org/en/latest/faq.html#i-m-having-hard-times-downloading-files-using-download)
        webSecurityEnabled: false
    }
});

var save = function(casper, filename) {
    downloading = true;
    console.log("Saving…");
    var html = String(casper.getHTML()); // grab our HTML (http://casperjs.readthedocs.org/en/latest/modules/casper.html#gethtml)
    require('fs').write(filename, html, 'w'); // and save it to a file (https://docs.nodejitsu.com/articles/file-system/how-to-write-files-in-nodejs)
    console.log("…wrote HTML to", filename);
    downloading = false;
};

/**
 * Queue the casper steps that open one URL, wait, and save its HTML to disk.
 *
 * The output filename is derived from the URL: a leading http:// or https://
 * is dropped, every character that is not an ASCII letter or digit is removed,
 * and '.html' is appended (e.g. "https://google.com" -> "googlecom.html").
 *
 * @param {Object} casper Casper instance to queue the steps on.
 * @param {string} target URL to open.
 */
var download = function(casper, target) {
    casper.thenOpen(target); // using thenOpen to ensure we wait— http://casperjs.readthedocs.org/en/latest/modules/casper.html#thenopen
    casper.wait(1000, function() { // One example of waiting; could be replaced with waitForSelector, etc.
        console.log("Waiting 1000ms…");
        // Strip the scheme first and anchor it to the start, so "http" that
        // appears later in the URL is left alone. The class is spelled A-Za-z
        // because the range A-z would also admit [ \ ] ^ _ and `.
        var filename = target.replace(/^https?:\/\//i, '').replace(/[^A-Za-z0-9]/g, '') + '.html';
        save(casper, filename);
    });
};


var currentTargetIndex = 0; // Index into `targets` of the next URL to download

casper.start(); // Configure and start casper; note we do this just once
casper.repeat(targets.length, function() { // Using http://casperjs.readthedocs.org/en/latest/modules/casper.html#repeat
    // download() queues its own thenOpen() for the target, so the page is not
    // opened here as well; doing both would fetch every URL twice.
    download(casper, targets[currentTargetIndex]);
    currentTargetIndex += 1; // the next repeat step picks up the next target
});
casper.run(function() {
    console.log("Done downloading", targets.length, "targets");
    casper.exit();
});
	// A short script to synchronously download a series of URLs

	// URLs to fetch, in order. For https targets you will probably need to invoke
	// the script with `casperjs --ssl-protocol=any`.
	var targets = [
		"https://google.com",
		"https://yahoo.com",
		"https://bing.com"
	];

	// The single casper instance used by the rest of the script.
	var casper = require('casper').create({
		logLevel: "info",
		verbose: true,
		pageSettings: {
			// Present a Chrome-on-Mac user agent string to the sites being fetched
			// (https://msdn.microsoft.com/en-us/library/ms537503(v=vs.85).aspx)
			userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11",
			// Web security is switched off, per the CasperJS FAQ entry on download problems
			// (http://casperjs.readthedocs.org/en/latest/faq.html#i-m-having-hard-times-downloading-files-using-download)
			webSecurityEnabled: false
		}
	});

	var save = function(casper, filename) {
	downloading = true;
	console.log("Saving…");
	var html = String(casper.getHTML()); // grab our HTML (http://casperjs.readthedocs.org/en/latest/modules/casper.html#gethtml)
	require('fs').write(filename, html, 'w'); // and save it to a file (https://docs.nodejitsu.com/articles/file-system/how-to-write-files-in-nodejs)
	console.log("…wrote HTML to", filename);
	downloading = false;
	};

	/**
	 * Queue the casper steps that open one URL, wait, and save its HTML to disk.
	 *
	 * The output filename is derived from the URL: a leading http:// or https://
	 * is dropped, every character that is not an ASCII letter or digit is removed,
	 * and '.html' is appended (e.g. "https://google.com" -> "googlecom.html").
	 *
	 * @param {Object} casper Casper instance to queue the steps on.
	 * @param {string} target URL to open.
	 */
	var download = function(casper, target) {
		casper.thenOpen(target); // using thenOpen to ensure we wait— http://casperjs.readthedocs.org/en/latest/modules/casper.html#thenopen
		casper.wait(1000, function() { // One example of waiting; could be replaced with waitForSelector, etc.
			console.log("Waiting 1000ms…");
			// Strip the scheme first and anchor it to the start, so "http" that
			// appears later in the URL is left alone. The class is spelled A-Za-z
			// because the range A-z would also admit [ \ ] ^ _ and `.
			var filename = target.replace(/^https?:\/\//i, '').replace(/[^A-Za-z0-9]/g, '') + '.html';
			save(casper, filename);
		});
	};


	var currentTargetIndex = 0; // Index into `targets` of the next URL to download

	casper.start(); // Configure and start casper; note we do this just once
	casper.repeat(targets.length, function() { // Using http://casperjs.readthedocs.org/en/latest/modules/casper.html#repeat
		// download() queues its own thenOpen() for the target, so the page is not
		// opened here as well; doing both would fetch every URL twice.
		download(casper, targets[currentTargetIndex]);
		currentTargetIndex += 1; // the next repeat step picks up the next target
	});
	casper.run(function() {
		console.log("Done downloading", targets.length, "targets");
		casper.exit();
	});