Skip to content

Instantly share code, notes, and snippets.

@huzhifeng
Created June 29, 2014 10:53
Show Gist options
  • Save huzhifeng/c8a37d34c4e23cd2707d to your computer and use it in GitHub Desktop.
Save huzhifeng/c8a37d34c4e23cd2707d to your computer and use it in GitHub Desktop.
Node.js crawler for http://www.youqu5.com
var request = require('request');
var cheerio = require('cheerio');
//var Iconv = require('iconv').Iconv;
//var iconv = new Iconv('GBK', 'UTF-8//TRANSLIT//IGNORE');
var iconv = require('iconv-lite');
var util = require('util');
// Root of the site being crawled; main() builds each category's first list
// page as <baseUrl>/<tid>/.
var baseUrl = 'http://www.youqu5.com';

// Categories to crawl, written as [tid, tname] pairs and expanded into
// {tid, tname} records. `tid` is the category's URL path segment and `tname`
// is the display name printed in log messages.
var navItems = [
  ['qutu', '雷人趣图'],
  ['gaoxiaotupian', '爆笑图片'],
  ['neihantu', '邪恶内涵图'],
  ['gaoxiaogif', '搞笑图片动态'],
  ['chuanbangjingtou', '穿帮镜头'],
  ['PSegaoqutu', '恶搞图片'],
  ['shijiezhizui', '世界之最'],
  ['yangyantupian', '养眼图片'],
  ['gaoxiaomeinv', '搞笑美女']
].map(function(pair) {
  return {
    tid: pair[0],
    tname: pair[1]
  };
});
// Settings handed to `request` for every page fetch; `url` is supplied per
// request by the crawl functions.
var options = {
  url: '',
  method: 'GET',
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
    Connection: 'Keep-Alive',
    Host: 'www.youqu5.com'
  },
  // No automatic text decoding: the response body is passed to gbkDecode().
  encoding: null
  // To send traffic through a local proxy while debugging, add:
  //   proxy: 'http://127.0.0.1:7788'
};
/**
 * Decode GBK-encoded bytes into a JavaScript string using iconv-lite.
 *
 * @param {*} data - Bytes to decode (the crawl functions pass the raw
 *   response body).
 * @returns {string} Whatever iconv.decode() produces for the 'gbk' encoding.
 */
function gbkDecode(data) {
  var decodedText = iconv.decode(data, 'gbk');
  return decodedText;
}
/**
 * Tell whether a value is an absolute http:// or https:// URL.
 *
 * The scheme is matched case-insensitively, so "HTTP://host/" counts as
 * absolute just like "http://host/". Non-string values are coerced to a
 * string by RegExp#test, which makes undefined and null yield false.
 *
 * @param {string} url - Candidate URL, e.g. an href or src attribute value.
 * @returns {boolean} true when `url` starts with an http or https scheme.
 */
function isFullUrl(url) {
  return /^https?:\/\//i.test(url);
}
/**
 * Turn a URL found in a crawled page into an absolute URL.
 *
 * - A falsy value (undefined, null, '') yields ''.
 * - A URL that isFullUrl() accepts is returned unchanged.
 * - A protocol-relative URL ("//host/path") gets an "http:" scheme; gluing it
 *   onto the site root would produce a bogus "http://www.youqu5.com//host/path".
 * - Anything else is appended to the site root with exactly one "/" between
 *   the root and the path.
 *
 * @param {string} url - Raw href/src attribute value.
 * @returns {string} Absolute URL, or '' when no URL was given.
 */
function formatUrl(url) {
  // Same origin as the crawler's base URL.
  var siteRoot = 'http://www.youqu5.com';

  if (!url) {
    return '';
  }
  if (isFullUrl(url)) {
    return url;
  }
  if (url.indexOf('//') === 0) {
    // Network-path reference: it names its own host, so only add a scheme.
    return 'http:' + url;
  }
  var separator = url.charAt(0) === '/' ? '' : '/';
  return siteRoot + separator + url;
}
/**
 * Fetch one detail page and print the post extracted from it.
 *
 * The page is decoded from GBK and must contain at least one image under
 * `div.tcontent`, a non-empty `div.info h1` title and a `div.info p.time`
 * element whose text holds a YYYY-MM-DD date. When everything is present an
 * object {title, date, created, tags, imgs, link} is written with
 * console.dir(); otherwise a diagnostic is logged and the page is skipped.
 * Nothing is returned and no error is thrown to the caller.
 *
 * @param {string} url - Absolute URL of the detail page.
 */
function crawlDetail(url) {
  // Use a private shallow copy of the shared request settings so that this
  // call never mutates the global `options` object.
  var requestOptions = {};
  Object.keys(options).forEach(function(key) {
    requestOptions[key] = options[key];
  });
  requestOptions.url = url;

  request(requestOptions, function(err, res, body) {
    console.log(util.format('Current detail: %s', url));
    if (err || res.statusCode !== 200) {
      // Report only the error and the status code; inspecting the whole
      // response object at unlimited depth produces an enormous dump.
      console.log(util.inspect({
        'err': err,
        'statusCode': res ? res.statusCode : undefined
      }, {depth: null}));
      return;
    }

    var $ = cheerio.load(gbkDecode(body));
    var eImgs = $('div.tcontent img');
    // .first() on an empty selection yields empty text, so a missing element
    // and an empty element are both caught by the emptiness checks below.
    var title = $('div.info h1').first().text().trim();
    var dateText = $('div.info p.time').first().text().trim();
    if ((eImgs.length < 1) || !dateText || !title) {
      console.log('Invalid eImgs, eDate or eTitle');
      // Summarise what was found instead of dumping the cheerio selections,
      // whose nodes link back into the whole parsed document.
      console.log(util.inspect({
        'imgCount': eImgs.length,
        'date': dateText,
        'title': title
      }, {depth: null}));
      return;
    }

    // Keep only the YYYY-MM-DD part of the time text.
    var match = dateText.match(/\d{4}-\d{2}-\d{2}/);
    var date = match ? match[0] : '';
    if (!date) {
      console.log('Invalid title or date');
      console.dir({
        'title': title,
        'date': date
      });
      return;
    }

    var imgs = [];
    eImgs.each(function() {
      var src = formatUrl($(this).attr('src'));
      // NOTE(review): the caption is read from the `title` attribute, not
      // `alt` - confirm this matches the site's markup.
      var alt = $(this).attr('title');
      if (!isFullUrl(src)) {
        console.log('Invalid src:' + src);
        console.dir($(this));
        return; // skip this image, keep iterating
      }
      var img = {
        'src': src
      };
      if (alt) {
        img.alt = alt;
      }
      imgs.push(img);
    });
    if (imgs.length === 0) {
      console.log('Invalid imgs');
      return;
    }

    var obj = {
      'title': title,
      'date': date,
      'created': new Date(),
      // NOTE(review): every post is tagged 'gif' regardless of the category
      // it came from - confirm this is intended.
      'tags': ['gif'],
      'imgs': imgs,
      'link': url
    };
    console.dir(obj);
  });
}
/**
 * Crawl one list page of a category, then move on to the next page.
 *
 * Every `a.preview` link in the picture list that is an absolute URL is
 * handed to crawlDetail() one second later. If the pager contains a link
 * whose text is exactly '下一页' ("next page") and whose href is an absolute
 * URL, `entry.url` is advanced to it and crawlTag() is scheduled again after
 * one second; otherwise the category is reported as finished. Links that are
 * not absolute URLs are ignored.
 *
 * @param {{tid: string, tname: string, url: string}} entry - Category record;
 *   `url` is the list page to fetch and is overwritten with the next page's
 *   URL as the crawl advances.
 */
function crawlTag(entry) {
  var pageUrl = entry.url;

  // Use a private shallow copy of the shared request settings so that this
  // call never mutates the global `options` object.
  var requestOptions = {};
  Object.keys(options).forEach(function(key) {
    requestOptions[key] = options[key];
  });
  requestOptions.url = pageUrl;

  request(requestOptions, function(err, res, body) {
    console.log(util.format('Current page: %s', pageUrl));
    if (err || res.statusCode !== 200) {
      // Report only the error and the status code; inspecting the whole
      // response object at unlimited depth produces an enormous dump.
      console.log(util.inspect({
        'err': err,
        'statusCode': res ? res.statusCode : undefined
      }, {depth: null}));
      return;
    }

    var $ = cheerio.load(gbkDecode(body));

    // All detail fetches for this page use the same one-second delay.
    $('div.box-l-m ul.pic-m li a.preview').each(function() {
      var link = $(this).attr('href');
      if (isFullUrl(link)) {
        setTimeout(crawlDetail, 1000, link);
      }
    });

    // If several pager links carry the "next page" text, the last one wins.
    var nextUrl = '';
    $('div.pagelist a').each(function() {
      if ($(this).text() === '下一页') {
        nextUrl = $(this).attr('href');
      }
    });

    if (isFullUrl(nextUrl)) {
      entry.url = nextUrl;
      setTimeout(crawlTag, 1000, entry);
    } else {
      console.log(util.format('Last page of %s[%s]', entry.tname, entry.tid));
    }
  });
}
/**
 * Kick off the crawl: give every category in navItems its first list-page
 * URL (<baseUrl>/<tid>/) and schedule crawlTag() for it, with each category
 * starting 30 seconds after the previous one.
 */
function main() {
  var staggerMs = 30 * 1000;
  for (var position = 0; position < navItems.length; position += 1) {
    var category = navItems[position];
    category.url = util.format('%s/%s/', baseUrl, category.tid);
    setTimeout(crawlTag, position * staggerMs, category);
  }
}
// Script entry point: start crawling as soon as the file is executed.
main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment