Skip to content

Instantly share code, notes, and snippets.

@huzhifeng
Created June 29, 2014 06:52
Show Gist options
  • Save huzhifeng/e988a4d8cd5f01bd8c0f to your computer and use it in GitHub Desktop.
Save huzhifeng/e988a4d8cd5f01bd8c0f to your computer and use it in GitHub Desktop.
Node.js crawler for http://www.gaoxiaogif.com
var request = require('request');
var cheerio = require('cheerio');
//var Iconv = require('iconv').Iconv;
//var iconv = new Iconv('GBK', 'UTF-8//TRANSLIT//IGNORE');
var iconv = require('iconv-lite');
var util = require('util');
// Root of the site being crawled; main() appends a category id to it to get
// the first listing page of each category.
var baseUrl = 'http://www.gaoxiaogif.com';

// Categories to crawl. Each row is [tid, tname]:
//   tid   - path segment of the category on the site
//   tname - display name, emitted as a tag on every crawled item
// The rows are expanded into fresh { tid, tname } objects because main() and
// crawlTag() later attach a mutable `url` property to each entry.
var navItems = [
  ['zhenrengif', '真人'],          // real people
  ['meinvgif', '美女'],            // beauties
  ['erdonggif', '儿童'],           // children
  ['dongwugif', '动物'],           // animals
  ['tiyugif', '体育'],             // sports
  ['jiaotongdongtai', '交通'],     // traffic
  ['kongbudongtaitupian', '恐怖'], // horror
  ['ogiftupian', '未分类']         // uncategorized
].map(function(row) {
  return {
    tid: row[0],
    tname: row[1]
  };
});
/**
 * Tell whether a string is an absolute http(s) URL.
 *
 * The scheme is matched case-insensitively (URI schemes are
 * case-insensitive), so "HTTP://host/" counts as absolute as well.
 * Protocol-relative references ("//host/path") and other schemes are not
 * considered full URLs.
 *
 * @param {string} url - Candidate URL; non-string values are coerced by
 *   RegExp#test and therefore yield false for undefined/null.
 * @returns {boolean} true when `url` starts with "http://" or "https://".
 */
function isFullUrl(url) {
  return /^https?:\/\//i.test(url);
}
/**
 * Turn an href/src value scraped from the site into an absolute URL.
 *
 * - empty / missing value      -> ''
 * - already absolute http(s)   -> returned unchanged
 * - protocol-relative (//host) -> "http:" is prepended; previously this was
 *                                 mistaken for a site-absolute path and the
 *                                 site host was glued in front of it
 * - site-absolute (/path)      -> baseUrl + path
 * - anything else              -> baseUrl + '/' + value
 *
 * @param {string} url - Raw attribute value; may be undefined.
 * @returns {string} Absolute URL, or '' when there is nothing to format.
 */
function formatUrl(url) {
  if (!url) {
    return '';
  }
  if (isFullUrl(url)) {
    return url;
  }
  if (url.indexOf('//') === 0) {
    // Only the scheme is missing; the host is already part of the value.
    return 'http:' + url;
  }
  if (url.charAt(0) === '/') {
    return util.format('%s%s', baseUrl, url);
  }
  return util.format('%s/%s', baseUrl, url);
}
/**
 * Fetch one listing page of a category, print every item found on it, and
 * schedule the next listing page of the same category if there is one.
 *
 * Side effects: performs an HTTP GET, writes to the console, overwrites
 * `entry.url` with the next page's URL and re-schedules itself via setTimeout.
 *
 * @param {Object} entry - Category descriptor.
 * @param {string} entry.url - URL of the listing page to fetch.
 * @param {string} entry.tid - Category id (used in log output).
 * @param {string} entry.tname - Category display name (added to item tags).
 */
function crawlTag(entry) {
  var urlLib = require('url');
  var url = entry.url;
  var options = {
    'url': url,
    'method': 'GET',
    'headers': {
      'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
      'Connection': 'Keep-Alive',
      'Host': 'www.gaoxiaogif.com'
    },
    // null: keep the body undecoded so it can be converted from GBK below.
    'encoding': null,
    //'proxy': 'http://127.0.0.1:7788'
  };
  request(options, function(err, res, body) {
    console.log(util.format('Current page: %s', url));
    if (err || res.statusCode !== 200) {
      console.log(util.inspect({
        'err': err,
        'res': res
      }, {depth: null}));
      return;
    }
    body = iconv.decode(body, 'gbk');
    var $ = cheerio.load(body);

    $('dl.listitem').each(function(index, listItem) {
      var eTimeYM = $(listItem).find('span.ym').first();
      var eTimeD = $(listItem).find('span.d').first();
      var eTitle = $(listItem).find('h3 a').first();
      var eA = $(listItem).find('div.cont a').first();
      var eImg = $(listItem).find('div.cont img').first();
      // Skip items that lack any of the elements the record is built from.
      if ((eTimeYM.length !== 1) ||
          (eTimeD.length !== 1) ||
          (eTitle.length !== 1) ||
          (eA.length !== 1) ||
          (eImg.length !== 1)) {
        console.log('Invalid listItem');
        console.log(util.inspect($(listItem), {depth: null}));
        return;
      }

      // The two spans together must form a YYYY/MM/DD date.
      var date = util.format('%s/%s', eTimeYM.text().trim(), eTimeD.text().trim());
      var isValidDate = /^\d{4}(\/\d{2}){2}$/.test(date);
      if (!isValidDate) {
        console.log('Invalid date:' + date);
        console.dir({
          'eTimeYM': eTimeYM,
          'eTimeD': eTimeD
        });
        return;
      }

      // Fall back to the image's alt text. The alt attribute may be absent,
      // so default it to '' instead of calling .trim() on undefined, which
      // would throw inside this callback and abort the crawl.
      var title = eTitle.text().trim() || (eImg.attr('alt') || '').trim();
      var link = formatUrl(eTitle.attr('href'));
      if (!title) {
        console.log('Invalid title:' + title);
        console.log(util.inspect({
          'eTitle': eTitle,
          'eImg': eImg
        }, {depth: null}));
        return;
      }

      var src = formatUrl(eA.attr('href'));
      if (!src) {
        console.log('Invalid src:' + src);
        console.dir(eA);
        return;
      }

      var img = {
        'src': src,
        'alt': eImg.attr('alt') || title
      };
      var thumbnailUrl = eImg.attr('src') || eImg.attr('original');
      if (thumbnailUrl) {
        img.thumbnail = thumbnailUrl;
      }

      var obj = {
        'title': title,
        'date': date,
        'created': new Date(),
        'tags': ['gif', entry.tname],
        'imgs': [img],
        'link': link
      };
      console.dir(obj);
    });

    // The "next page" anchor is expected to be the second-to-last pager link.
    var ePageList = $('.pagelist a');
    var eNext = ePageList.eq(ePageList.length - 2);
    if ((eNext.length === 1) && (eNext.text() === '下一页')) {
      var nextHref = eNext.attr('href');
      // Resolve against the page just fetched so that relative pager links
      // are followed too instead of silently ending the crawl.
      var nextUrl = nextHref ? urlLib.resolve(url, nextHref) : '';
      if (isFullUrl(nextUrl)) {
        entry.url = nextUrl;
        // Wait a second between pages of the same category.
        setTimeout(crawlTag, 1000, entry);
      } else {
        console.log(util.format('Cannot follow next page link "%s" of %s[%s]', nextHref, entry.tname, entry.tid));
      }
    } else {
      console.log(util.format('Last page of %s[%s]', entry.tname, entry.tid));
    }
  });
}
/**
 * Entry point: assign every category its first listing URL and start its
 * crawl, staggering the start times so that category N begins N * 30 seconds
 * after launch.
 */
function main() {
  var staggerMs = 30 * 1000;
  for (var i = 0; i < navItems.length; i++) {
    var category = navItems[i];
    category.url = util.format('%s/%s/', baseUrl, category.tid);
    setTimeout(crawlTag, i * staggerMs, category);
  }
}

main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment