Skip to content

Instantly share code, notes, and snippets.

@ryuone
Created April 4, 2011 12:52
Show Gist options
  • Save ryuone/901580 to your computer and use it in GitHub Desktop.
Save ryuone/901580 to your computer and use it in GitHub Desktop.
Node.js program that parses an HTML page, finds the image files it references, and saves them locally.
/* node getImages.js http://www.yahoo.co.jp */
var htmlparser = require('htmlparser');
var sys = require('sys');
var http = require('http');
var fs = require('fs');
var url = require('url');
var path = require('path');
/**
 * Validates the command line and parses the target page URL.
 *
 * @param {string[]} argv - Argument vector laid out like process.argv
 *     (interpreter, script, then exactly one URL).
 * @returns {Object|null} The url.parse() result for the URL, or null (after
 *     logging the reason) when the arguments cannot be used.
 */
function parseTargetUrl(argv){
    if(argv.length !== 3){
        console.log('arguments error.');
        return null;
    }
    var target = url.parse(argv[2]);
    if(target.host == null){
        console.log('URL parsed Error.(' + argv[2] + ')');
        return null;
    }
    return target;
}

/**
 * Downloads the page at parsedUrl, collects the image URLs found in it and
 * hands every image to writeImageFile.
 *
 * @param {Object} parsedUrl - url.parse() result for the page to scan.
 */
function fetchPage(parsedUrl){
    var html = [];
    var options = {
        host: parsedUrl.hostname,
        port: parsedUrl.port || 80,
        // Send the query string too; the pathname alone names a different resource.
        path: (parsedUrl.pathname || '/') + (parsedUrl.search || '')
    };
    // http.get() is the API getImage already relies on. It replaces
    // http.createClient(), which current Node.js no longer provides, and it
    // no longer sends the parsed URL object as if it were request headers.
    var request = http.get(options, function(res){
        console.log("->Start");
        console.log('STATUS : ' + res.statusCode);
        console.log('HEADER : ' + JSON.stringify(res.headers));
        res.setEncoding('utf8');
        res.on('data', function(chunk){
            html.push(chunk);
        });
        res.on('end', function(){
            console.log("->End");
            var myhtmlparser = new MyHtmlParser(parsedUrl);
            myhtmlparser.parse(html.join(''));
            myhtmlparser.getImageElements({tagName:'img'});
            myhtmlparser.urls.forEach(function(imageUrl){
                console.log(imageUrl);
                myhtmlparser.getImage(url.parse(imageUrl), writeImageFile);
            });
        });
    });
    // Without a listener, a DNS or connection failure is thrown as an
    // unhandled 'error' event and aborts the process.
    request.on('error', function(error){
        console.log('request error : ' + error.message);
    });
}

/**
 * Script entry point: reads the URL from process.argv and starts the download.
 */
function main(){
    var parsedUrl = parseTargetUrl(process.argv);
    if(parsedUrl !== null){
        fetchPage(parsedUrl);
    }
}

main();
/**
 * Saves downloaded image data as a file inside the local 'imagedir'
 * directory, creating that directory first when it does not exist yet.
 * Failures are logged; nothing is thrown to the caller.
 *
 * @param {string} imageData - Image bytes as a 'binary'-encoded string.
 * @param {string} fname - File name to save under; only its last path
 *     component is used.
 */
function writeImageFile(imageData, fname){
    var imagedir = 'imagedir';
    // Same permission bits as the octal literal 0766, spelled without the
    // legacy octal syntax that strict mode and ES modules reject.
    var dirMode = parseInt('766', 8);
    // The name comes from a remote URL: basename() keeps the write inside imagedir.
    var target = path.join(imagedir, path.basename(String(fname)));

    fs.stat(imagedir, function(statError){
        if(statError){
            try{
                fs.mkdirSync(imagedir, dirMode);
            }catch(mkdirError){
                // Several downloads can finish close together, so another
                // call may have created the directory after our stat failed.
                if(mkdirError.code !== 'EEXIST'){
                    console.log(mkdirError);
                    return;
                }
            }
        }
        fs.writeFile(target, imageData, 'binary', function(writeError){
            if(writeError){
                console.log(writeError);
            }
        });
    });
}
/* -------------------------------------------------------- */
/**
 * Collects image URLs from an HTML page and downloads the images.
 *
 * @constructor
 * @param {Object} parsedUrl - url.parse() result for the page being scanned;
 *     used as the base when resolving the image URLs found in it.
 * @throws {Error} When invoked without `new`.
 */
var MyHtmlParser = function(parsedUrl){
    // Check before touching `this`: called as a plain function, `this` is the
    // global object (sloppy mode) or undefined (strict mode).
    if(!(this instanceof MyHtmlParser)){
        throw new Error("Error this is global.");
    }
    this.parsedUrl = parsedUrl;
    this.parsedResult = null;
    this.urls = [];
    this.handler = new htmlparser.DefaultHandler(function (error, dom) {
        if(error){
            // console.error instead of sys.debug, which current Node.js no longer has.
            console.error("Error : " + error);
        }
    });
};

/** Maximum number of redirects followed for one image, so redirect loops end. */
MyHtmlParser.MAX_REDIRECTS = 5;

/**
 * Parses a complete HTML document and stores the handler's DOM in
 * this.parsedResult.
 *
 * @param {string} html - Full HTML source of the page.
 */
MyHtmlParser.prototype.parse = function(html){
    var parser = new htmlparser.Parser(this.handler);
    parser.parseComplete(html);
    this.parsedResult = this.handler.dom;
};

/**
 * Appends to this.urls the absolute URL of every .jpg file referenced (via
 * href or src) by elements with the requested tag name in the parsed page.
 *
 * @param {{tagName: string}} [option] - Tag to scan, compared against the
 *     lower-cased element name; defaults to { tagName: 'img' }.
 */
MyHtmlParser.prototype.getImageElements = function(option){
    var wantedTag = option == null ? 'img' : option.tagName;
    var tags = htmlparser.DomUtils.getElements({ tag_name: function(val){
        return val.toLowerCase() === wantedTag;
    }}, this.parsedResult);
    var baseUrl = url.format(this.parsedUrl);
    // Start at index 0 so the first matching element is not skipped.
    for(var i=0,imax=tags.length; i<imax; i++){
        var attribs = tags[i].attribs;
        var link = attribs && (attribs.href || attribs.src);
        if(link && /\.jpg$/i.test(link)){
            // resolve() copes with absolute, root-relative, page-relative and
            // protocol-relative links and keeps a non-default port, none of
            // which plain string concatenation handled.
            this.urls.push(url.resolve(baseUrl, link));
        }
    }
};

/**
 * Downloads one image over HTTP and passes its data to callback. Redirects
 * (301, 302, 303, 307, 308) are followed up to a limit; any other non-200
 * status is logged and the callback is not invoked.
 *
 * NOTE: the request always goes through http.get, so an https URL without an
 * explicit port is requested over plain HTTP on port 80.
 *
 * @param {Object} parsedUrl - url.parse() result for the image.
 * @param {function(string, string)} callback - Receives the body as a
 *     'binary'-encoded string and the file name taken from the URL path.
 * @param {number} [redirectsLeft] - Redirects still allowed; defaults to
 *     MyHtmlParser.MAX_REDIRECTS.
 */
MyHtmlParser.prototype.getImage = function(parsedUrl, callback, redirectsLeft){
    var self = this;
    var body = [];
    var remaining = redirectsLeft == null ? MyHtmlParser.MAX_REDIRECTS : redirectsLeft;
    var redirectCodes = [301, 302, 303, 307, 308];
    var options = {
        // hostname, not host: host also carries ":port".
        host: parsedUrl.hostname,
        port: parsedUrl.port || 80,
        path: (parsedUrl.pathname || '/') + (parsedUrl.search || '')
    };
    var request = http.get(options, function(res){
        res.setEncoding('binary');
        res.on('data', function(chunk){
            body.push(chunk);
        });
        res.on('end', function(){
            var status = res.statusCode;
            if(status === 200){
                // basename() also works for files directly under "/", where
                // the previous ".+/(.+?)$" match returned null.
                var fname = path.basename(parsedUrl.pathname || '');
                if(fname === ''){
                    console.log(' -> no file name in ' + url.format(parsedUrl));
                    return;
                }
                callback(body.join(''), fname);
            }else if(redirectCodes.indexOf(status) !== -1 && res.headers.location){
                if(remaining <= 0){
                    console.log(' -> too many redirects');
                    return;
                }
                // Location may be relative to the URL that was just requested.
                var nextUrl = url.resolve(url.format(parsedUrl), res.headers.location);
                self.getImage(url.parse(nextUrl), callback, remaining - 1);
            }else{
                console.log(' -> ' + status);
            }
        });
    });
    // Without a listener, a connection failure for a single image is thrown
    // as an unhandled 'error' event and aborts the whole run.
    request.on('error', function(error){
        console.log(' -> ' + error.message);
    });
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment