Skip to content

Instantly share code, notes, and snippets.

@onel0p3z
Created April 24, 2013 23:12
Show Gist options
  • Save onel0p3z/5456353 to your computer and use it in GitHub Desktop.
Save onel0p3z/5456353 to your computer and use it in GitHub Desktop.
Parses the HTML files in the "Templates" folder, finds their IMG tags, reads each SRC attribute, and downloads the images to the current folder. Not the best way, but it worked for me. It can also create a file listing the image links. Tips from http://maxogden.com/scraping-with-node.html
// Third-party modules: cheerio (HTML parsing), underscore (utility helpers)
// and request (HTTP client used to fetch the images).
var $ = require('cheerio'),
_ = require('underscore'),
request = require('request'),
// Accumulator for image URLs; it is de-duplicated and printed by the report
// at the end of the script (which can optionally append it to a file).
images = [],
path = require('path'),
// Folder that is scanned, resolved relative to this script.
// NOTE(review): '\T' is not a recognised escape sequence, so this literal
// evaluates to plain 'Templates' -- the backslash has no effect.
dir = path.join(__dirname,'\Templates'),
fs = require('fs'),
GetImage = function(file){
var HtmlFile = fs.readFileSync(file).toString(),
ParsedHtml = $.load(HtmlFile);
ParsedHtml('img').map(function(i,img){
var src = $(img).attr('src');
// Discard images containing not-good
if(!src.match('not-good')){
//images.push(src.toString());
var url = src.toString();
var imgName = url.slice(url.lastIndexOf('/')+1,url.length)
if(url.length != 0){
request(url).pipe(fs.createWriteStream(imgName));
}
}
});
},
// Recursively walk `dir`; findit invokes GetImage for every entry it finds.
finder = require('findit').find(dir, GetImage);

// Print the summary when the walk has actually finished, rather than after a
// fixed delay that may fire before a large tree has been fully scanned.
// GetImage collects URLs synchronously, so `images` is complete at this point
// (the downloads themselves may still be in flight).
finder.on('end', function(){
  images = _.uniq(images);
  // If you want to create a file
  // fs.appendFileSync("output.txt", images.toString() + "\n");
  // Output results
  console.log(images);
  console.log('images uniq: '+images.length);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment