Skip to content

Instantly share code, notes, and snippets.

@dezull
Created February 29, 2016 06:47
Show Gist options
  • Save dezull/e039bbfbc8157e27e016 to your computer and use it in GitHub Desktop.
Save dezull/e039bbfbc8157e27e016 to your computer and use it in GitHub Desktop.
Scrape & archive webpage
var fs = require('fs-extra'),
_ = require('lodash'),
Promise = require('bluebird'),
archiver = require('archiver'),
scraper = require('website-scraper');
// Lowest-priority fallback options: `create().scrape` merges these in after
// the per-call options and the factory options, so they only fill in keys
// that neither of those provides.
// The only default is the User-Agent header value, which identifies the
// client as mobile Chrome on an Android Nexus 4.
var defaultOptions = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'
};
/**
 * Zip the contents of `options.directory` into `<options.directory>.zip`,
 * then delete the directory (whether or not zipping succeeded).
 *
 * @param {Object} options
 * @param {string} options.directory Directory to archive; removed afterwards.
 * @return {Promise} Resolves with the path of the zip file. Rejects with the
 *   zipping error if archiving fails, or with the removal error if archiving
 *   succeeds but the directory cannot be deleted.
 */
function zip(options) {
  var zipPath = options.directory + '.zip';
  var zipFile = fs.createWriteStream(zipPath);

  var archive = function() {
    return new Promise(function(resolve, reject) {
      var archiveStream = archiver.create('zip', {});

      zipFile
        .on('finish', resolve)
        .on('error', reject);
      archiveStream.on('error', reject);

      // Attach the destination before finalizing, and never chain off the
      // return value of finalize(): depending on the archiver version it is
      // either the archive itself or a Promise (which has no .pipe()).
      archiveStream.pipe(zipFile);

      // It is very important that we set the same date for each file, so
      // that zip files containing identical files have the same checksum.
      archiveStream.directory(options.directory, '/', { date: new Date(0) });

      // If finalize() returns a Promise, route its rejection to ours instead
      // of leaving it unhandled; if it returns the archive, this is a no-op.
      Promise.resolve(archiveStream.finalize()).catch(reject);
    });
  };

  var removeDirectory = function() {
    return new Promise(function(resolve, reject) {
      fs.remove(options.directory, function(e) {
        if (!e) return resolve();
        reject(e);
      });
    });
  };

  return archive()
    .catch(function(e) {
      // Zipping failed: remove the directory regardless, then rethrow the
      // zipping error. A failure of this best-effort cleanup is deliberately
      // ignored so it cannot mask the original error.
      return removeDirectory()
        .catch(function() {})
        .then(function() {
          throw e;
        });
    })
    .then(function() { return removeDirectory(); })
    .then(function() { return zipPath; });
}
/**
 * Download a single page with the `website-scraper` module.
 *
 * @param {Object} options
 * @param {string} options.url URL to scrape (passed as a one-element list).
 * @param {string} options.directory Directory handed to the scraper.
 * @param {string} options['User-Agent'] Value sent as the User-Agent header.
 * @return {*} Whatever `scraper.scrape` returns; the caller in this file
 *   chains `.then` on it, so it is presumably a promise.
 */
function scrape(options) {
  var targetUrls = [ options.url ];
  var targetDirectory = options.directory;
  var requestHeaders = { 'User-Agent': options['User-Agent'] };

  var scraperOptions = {
    urls: targetUrls,
    directory: targetDirectory,
    request: { headers: requestHeaders }
  };

  return scraper.scrape(scraperOptions);
}
/**
* Create a scraper.
*
* factoryOptions:
* {
* directory: baseDirectoryForWebpage (required)
* }
*
* @param {Object} factoryOptions See above
* @return {Object} scraper object
*/
exports.create = function(factoryOptions) {
return {
scrape: function(options) {
_.defaults(options, factoryOptions, defaultOptions);
options.directory = options.directory + '/' + options.id;
return scrape(options)
.then(function() { return zip(options); })
;
}
};
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment