Skip to content

Instantly share code, notes, and snippets.

@s3131212
Created May 14, 2013 12:09
Show Gist options
  • Save s3131212/5575433 to your computer and use it in GitHub Desktop.
Save s3131212/5575433 to your computer and use it in GitHub Desktop.
{
"db":{
"host":"127.0.0.1",
"port":3306,
"user":"root",
"password":"",
"database":"crawler"
},
"baseSite":"http://s3131212.com/links/"
}
var fs = require('fs');
/**
 * Loads a JSON configuration file, keeps the parsed object in memory and
 * refreshes it whenever the file changes on disk.
 */
var ConfigLoader = {
    // Most recently parsed configuration; stays null until load() succeeds.
    _cache: null,
    // Watcher created by the latest load(), kept so a repeated load() can
    // close it instead of stacking watchers.
    _watcher: null,

    /**
     * Synchronously read and parse `file`, then watch it for changes.
     *
     * @param {string} file Path to a JSON file.
     * @returns {Object} The parsed configuration.
     * @throws {Error} If the initial read or JSON parse fails.
     */
    load: function(file){
        var self = this;
        this._cache = JSON.parse(fs.readFileSync(file).toString());

        if(this._watcher){
            this._watcher.close();
            this._watcher = null;
        }

        try {
            this._watcher = fs.watch(file, function(){
                // A change event can fire while the file is half-written or
                // briefly missing (editors that replace the file), so a
                // failed reload keeps the last good configuration instead of
                // throwing out of the watcher callback.
                try {
                    self._cache = JSON.parse(fs.readFileSync(file).toString());
                }
                catch(err){
                    console.log('Error reloading config %s, keeping previous values', file);
                    console.log(err);
                }
            });
            // An 'error' event without a listener would be thrown.
            this._watcher.on('error', function(err){
                console.log('Error watching config %s', file);
                console.log(err);
            });
        }
        catch(err){
            // Watching is a convenience; the already-parsed config stays usable.
            console.log('Unable to watch config %s for changes', file);
            console.log(err);
        }

        return this._cache;
    },

    /**
     * Look up a top-level key of the loaded configuration.
     *
     * @param {string} key Property name.
     * @returns {*} The value, or undefined when the key is absent or
     *     nothing has been loaded yet.
     */
    get: function(key){
        return this._cache ? this._cache[key] : undefined;
    }
};
module.exports = ConfigLoader;
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');
var db = require('mysql');
var util = require('./utils.js');
var config = require('./configLoader.js');
config.load(__dirname+'/config.json');
/**
 * Queue-driven crawler.
 *
 * Each crawl() takes one row from the `queue` table (falling back to the
 * configured base site when the queue is empty), downloads that page,
 * records it in `websites` and appends the links found on it to `queue`.
 */
var Crawler = function(){
    var self = this;
    // Delay before the next step after a failed queue read, so a broken
    // connection does not turn the caller's loop into a busy spin.
    var RETRY_DELAY_MS = 1000;

    this.conn = db.createConnection(config.get('db'));
    this.indexed = 0;                 // pages successfully written to `websites`
    this.baseSite = config.get('baseSite');
    this._url = this.baseSite;        // page crawled before the current one (stored as `from`)
    this.url = this.baseSite;         // page currently being crawled

    /**
     * Crawl a single page.
     *
     * @param {Function} cb Called with no arguments once the step is done,
     *     whether or not the page could be fetched.
     */
    this.crawl = function(cb){
        this.conn.query('SELECT * FROM `queue` LIMIT 0,1',function(e,result){
            if(e){
                // `result` is undefined on failure; report it instead of
                // crashing on `result.length`.
                console.log('Error reading queue.');
                console.log(e);
                setTimeout(cb,RETRY_DELAY_MS);
                return;
            }
            var entry = result.length > 0 ? result[0] : null;
            self.url = entry ? entry.url : self.baseSite;
            request(self.url,function(e,res,body){
                if(!e && res.statusCode === 200){
                    self.getInfo(body);
                }
                else {
                    console.log('Error requesting page %s',self.url);
                }
                // Updated only after getInfo() so that `from` still names
                // the previous page while this one is being recorded.
                self._url = self.url;
                if(!entry){
                    cb();
                    return;
                }
                self.conn.query('DELETE FROM `queue` WHERE `id` = ?',[entry.id],function(e){
                    if(e){
                        console.log('Error removing %s from queue.',entry.url);
                        console.log(e);
                    }
                    cb();
                });
            });
        });
    };

    /**
     * Extract metadata and links from a downloaded page, queue the links
     * and store the page itself.
     *
     * @param {string} html Markup of the page at `this.url`.
     */
    this.getInfo = function(html){
        var $ = cheerio.load(html);
        var title = $('head title').text();
        var keywords = $('head meta[name=keywords]').attr('content');
        var desc = $('head meta[name=description]').attr('content');
        // Captured now because the queries below finish after crawl() has
        // already moved `url`/`_url` on.
        var pageUrl = this.url;
        var referrer = this._url;
        console.log('Crawling "%s" | %s',title,pageUrl);

        // Collected with each() rather than chained map() calls: whether
        // cheerio's map() returns a plain array or a cheerio wrapper (whose
        // callbacks receive an index first) depends on the cheerio release.
        var hrefs = [];
        $('a').each(function(){
            hrefs.push($(this).attr('href'));
        });

        var rows = [];
        hrefs.forEach(function(href){
            var skip = !href || href === pageUrl || href === referrer ||
                /^#(\w)+/.test(href) || util.imageRegexp.test(href);
            if(skip){
                return;
            }
            var target = href;
            if(!util.isExternal(href)){
                try {
                    // Relative links belong to the page being parsed, not
                    // to the one crawled before it.
                    target = util.resolveRelativeURL(href,pageUrl);
                }
                catch(err){
                    console.log('Skipping unresolvable link %s',href);
                    return;
                }
            }
            rows.push({id:util.id(),url:target});
        });

        // Placeholder-based inserts let the driver do all the escaping.
        async.map(rows,function(row,done){
            self.conn.query('INSERT INTO `queue` SET ?',row,done);
        },function(e){
            if(e){
                console.log('Error writing queue.');
                console.log(e);
            }
        });

        this.conn.query('INSERT INTO `websites` SET ?',{
            id:util.id(),
            url:pageUrl,
            from:referrer,
            title:title,
            keywords:keywords || '',
            desc:desc || ''
        },function(e){
            if(e){
                console.log('Error indexing page %s',pageUrl);
                console.log(e);
            }
            else {
                console.log('Successfully indexed page %s',pageUrl);
                self.indexed++;
            }
        });
    };
};
module.exports = Crawler;
var Crawler = require('./crawler.js');
var async = require('async');
// Entry point: one crawler instance, driven one page at a time, forever.
var spider = new Crawler();

async.forever(function crawlNext(next){
    // crawl() reports completion without arguments; handing `null` to
    // forever's callback schedules the following iteration.
    var onPageDone = function(){
        next(null);
    };
    spider.crawl(onPageDone);
});
var path = require('path');
/**
 * Helpers shared by the crawler: random row ids, URL resolution and link
 * classification.
 */
var Utils = {
    /**
     * Build a random 75-character identifier from [0-9a-zA-Z_].
     * Uses Math.random(), so the result is not suitable as a secret.
     *
     * @returns {string} The identifier.
     */
    id:function(){
        var chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
        var ret = '';
        for(var i = 0; i < 75; i++){
            ret += chars.charAt(Math.floor(Math.random() * chars.length));
        }
        return ret;
    },

    /**
     * Resolve the link `p` against the absolute URL `url` of the page it
     * was found on, and drop any fragment.
     *
     * Resolution follows URL rules rather than file-system path rules:
     * "other.html" replaces the last path segment of `url`, "/top" is
     * taken from the site root, and no platform path separator is involved.
     *
     * @param {string} p Link as written in the page (relative or absolute).
     * @param {string} url Absolute URL of the page containing the link.
     * @returns {string} Absolute URL without fragment.
     * @throws {TypeError} If `url` is not a valid absolute URL or the
     *     combination cannot be parsed.
     */
    resolveRelativeURL:function(p,url){
        var resolved = new URL(p,url);
        resolved.hash = ''; // remove URL fragment
        return resolved.href;
    },

    /**
     * Tell whether a link is already an absolute http(s) URL.
     * The "://" is required so that relative paths that merely begin with
     * "http" (e.g. "https-guide/index.html") are not misclassified.
     *
     * @param {string} url Link to classify.
     * @returns {boolean} True for absolute http:// or https:// URLs.
     */
    isExternal:function(url){
        return /^https?:\/\//i.test(url);
    },

    // Matches links that end in a known image extension (case-insensitive).
    imageRegexp: /(\.png|\.jpg|\.gif|\.bmp|\.psd)$/i
};
module.exports = Utils;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment