public
Created

Save all images found on the given paths of a website domain

  • Download Gist
package.json
JSON
1 2 3 4 5 6 7 8 9 10 11 12 13
{
"name": "rakeup",
"version": "0.0.1",
"author": "Fabrizio Codello",
"engines": {
"node": "0.6.x"
},
"private": true,
"dependencies": {
"http-agent": "0.1.x",
"jsdom": "0.2.x"
}
}
server.js
JavaScript
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
var httpAgent = require('http-agent'),
jsdom = require('jsdom'),
http = require('http'),
url = require('url'),
fs = require('fs'),
exec = require('child_process').exec,
child;
 
// Report whether `dir` names an existing directory.
// A failing stat call (for example because the path is missing) counts as
// "not a directory" instead of being propagated to the caller.
function dirExistsSync(dir) {
  var stats;

  try {
    stats = fs.statSync(dir);
  } catch (statError) {
    return false;
  }

  return stats.isDirectory();
}
 
// Retrieve a resource over plain HTTP and save it as savePath + name.
//   options - request options handed straight to http.get (host, port, path)
//   name    - file name to create inside savePath
// Every failure (request error, response error, non-200 status, write error)
// is logged and the image is skipped, so a single bad image cannot take the
// whole run down.
function saveImage(options, name) {
  http.get(options, function(res) {
    var imageData = '';

    // 'binary' maps every byte to one character, so the string concatenation
    // below does not mangle the image data
    res.setEncoding('binary');

    res.on('data', function(chunk) {
      imageData += chunk;
    });

    res.on('error', function(e) {
      console.log("Got error: " + e.message);
    });

    res.on('end', function() {
      // an error page or redirect body stored under an image name would only
      // produce a broken file
      if (res.statusCode !== 200) {
        console.log('Skipping ' + name + ': HTTP status ' + res.statusCode);
        return;
      }

      fs.writeFile(savePath + name, imageData, 'binary', function(err) {
        // throwing here would be an uncaught exception and kill the process;
        // log instead, like the request error handler below
        if (err) console.log('Could not save ' + name + ': ' + err.message);
      });
    });

  }).on('error', function(e) {
    console.log("Got error: " + e.message);
  });
}
 
// what to crawl: the site and the paths on it
var domain = 'boards.4chan.org';
var domainPaths = ['v', 'sci'];

// directory prefix the downloaded images are written under
var savePath = './images/';

// running total, reported when the agent stops
var count = 0;

// http-agent instance created for the domain and paths above
var agent = httpAgent.create(domain, domainPaths);
 
// Turn the src attribute of an <img> into what saveImage needs.
// Returns { options: {host, port, path}, fileName }, or null when the image
// cannot be fetched: no src at all, a URL that is not plain http (data:,
// https:, ...), or a path that does not end in a file name.
function imageRequestFor(imgSrc) {
  if (!imgSrc) return null;

  // resolving against the site root covers protocol-relative (//host/path)
  // and root-relative (/path) sources as well as absolute URLs
  // NOTE(review): path-relative sources are resolved against the site root,
  // not against the page they were found on - confirm that is acceptable
  var fullURL = url.parse(url.resolve('http://' + domain + '/', imgSrc));

  // saveImage goes through the plain http module only
  if (fullURL.protocol !== 'http:' || !fullURL.hostname) return null;

  // the name comes from pathname so that a query string never ends up in it
  var segments = (fullURL.pathname || '').split('/'),
      fileName = segments[segments.length - 1];

  if (!fileName) return null;

  return {
    options: {
      // hostname, not host: host would carry the port a second time
      host: fullURL.hostname,
      port: fullURL.port || 80,
      path: fullURL.path
    },
    fileName: fileName
  };
}

// For every page the agent delivers: load it into jsdom with jQuery, hand
// each usable <img> source to saveImage, then ask the agent for the next page.
agent.addListener('next', function (err, agent) {
  if (err) {
    // no usable page body; report it and carry on with the next path
    console.log("Got error: " + (err.message || err));
    agent.next();
    return;
  }

  jsdom.env({
    html: agent.body,
    scripts: [
      'http://code.jquery.com/jquery-1.7.2.min.js'
    ]
  }, function (err, window) {
    // without a window carrying jQuery there is nothing to query
    if (!window || !window.jQuery) {
      console.log('Could not load page into jsdom, skipping it.');
      agent.next();
      return;
    }

    var $ = window.jQuery;

    // grab all image URLs and save them on savePath
    $('img').each(function() {
      var request = imageRequestFor($(this).attr('src'));

      // plain return keeps the .each loop going with the next image
      if (!request) return;

      saveImage(request.options, request.fileName);
      count++;
    });

    agent.next();
  });
});
 
// When the agent stops, print the value of count and a closing line.
function reportDone() {
  var summary = ['* ', count, ' images saved.'].join('');

  console.log(summary);
  console.log('* Done.');
}

agent.addListener('stop', reportDone);
 
// Announce the run and start the agent. Only called once the download
// directory is ready, so that no image is written before the directory exists
// or while its old contents are still being removed.
function startGrabbing() {
  console.log('* Grabbing all images from '+ domain +'.');
  console.log('* Paths: /'+ domainPaths.join(' /') +'.');
  console.log('* Download dir: '+ savePath);
  agent.start();
}

if (!dirExistsSync(savePath)) {
  // create savePath if it does not exist yet
  fs.mkdir(savePath, function(err) {
    // without the directory nothing can be saved, so this stays fatal
    if (err) throw err;
    startGrabbing();
  });
} else {
  // if it exists, remove all directory contents before downloading
  child = exec('rm -rf '+ savePath +'*', function(err) {
    // leftovers from an earlier run are not a reason to abort
    if (err) console.log('Could not empty '+ savePath +': '+ err.message);
    startGrabbing();
  });
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.