Skip to content

Instantly share code, notes, and snippets.

@huzhifeng
Created June 29, 2014 10:48
Show Gist options
  • Save huzhifeng/77441b8d54d9b33a3eb6 to your computer and use it in GitHub Desktop.
Save huzhifeng/77441b8d54d9b33a3eb6 to your computer and use it in GitHub Desktop.
Node.js crawler for http://www.gifcool.com
var request = require('request');
var cheerio = require('cheerio');
//var Iconv = require('iconv').Iconv;
//var iconv = new Iconv('GBK', 'UTF-8//TRANSLIT//IGNORE');
var iconv = require('iconv-lite');
var util = require('util');
// Root URL of the crawled site; main() passes it to crawlPager() as the first page.
var baseUrl = 'http://www.gifcool.com';
// Request settings shared by every call to request(). crawlDetail() and
// crawlPager() overwrite `url` on this same object before each request.
var options = {
'url': '',
'method': 'GET',
'headers': {
'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
'Connection': 'Keep-Alive',
'Host': 'www.gifcool.com'
},
// null keeps request from decoding the body into a string, so the raw bytes
// can be handed to gbkDecode() instead.
'encoding': null,
// Optional proxy setting, disabled by default:
//'proxy': 'http://127.0.0.1:7788'
};
/**
 * Decode a GBK-encoded payload with iconv-lite.
 *
 * @param {*} data - Value handed straight to iconv.decode(); callers in this
 *     file pass the response body (presumably a Buffer, since the request
 *     options set `encoding: null`).
 * @returns {*} Whatever iconv.decode(data, 'gbk') returns.
 */
function gbkDecode(data) {
  var decoded = iconv.decode(data, 'gbk');
  return decoded;
}
/**
 * Tell whether a value starts with an explicit http:// or https:// scheme.
 *
 * The match is case-sensitive and anchored at the first character.
 *
 * @param {*} url - Candidate URL; non-strings are coerced by RegExp#test.
 * @returns {boolean} true when the value begins with "http://" or "https://".
 */
function isFullUrl(url) {
  var absolutePrefix = /^https?:\/\//;
  return absolutePrefix.test(url);
}
/**
 * Turn an href/src value taken from a crawled page into an absolute URL.
 *
 * - Falsy input yields ''.
 * - Values that already start with http:// or https:// are returned as-is.
 * - Protocol-relative values ("//host/path") get the "http:" scheme, matching
 *   the scheme of baseUrl, instead of being glued onto the site root.
 * - Everything else is resolved against the site root (baseUrl), inserting a
 *   "/" separator when the value does not start with one.
 *
 * @param {string} url - Raw attribute value.
 * @returns {string} Absolute URL, or '' when there is nothing to format.
 */
function formatUrl(url) {
  if (!url) {
    return '';
  }
  if (isFullUrl(url)) {
    return url;
  }
  // "//cdn.example.com/a.gif" already names its host; only the scheme is missing.
  if (url.indexOf('//') === 0) {
    return 'http:' + url;
  }
  if (url.charAt(0) === '/') {
    return util.format('%s%s', baseUrl, url);
  }
  return util.format('%s/%s', baseUrl, url);
}
/**
 * Fetch one detail page and print the GIF record extracted from it.
 *
 * The page is requested with the shared `options` object, decoded from GBK and
 * parsed with cheerio. On success the record (title, date, image, link) is
 * written with console.dir(); on any failure a diagnostic is logged and the
 * function returns without producing a record.
 *
 * @param {string} url - Absolute URL of the detail page.
 * @returns {undefined} Results are reported through the console only.
 */
function crawlDetail(url) {
  // `options` is shared module state; only its url changes per request.
  options.url = url;
  request(options, function(err, res, body) {
    console.log(util.format('Current detail: %s', url));
    if (err || res.statusCode !== 200) {
      // Log the error and status only; inspecting the whole response object
      // at unlimited depth floods the console.
      console.log(util.inspect({
        'err': err,
        'statusCode': res ? res.statusCode : undefined
      }, {depth: null}));
      return;
    }

    var $ = cheerio.load(gbkDecode(body));
    var eImg = $('div.list_b div.ovh img').first();
    var eDate = $('div.list_b div.ft span').last();
    var eTitle = $('div.list_b h1.title').first();

    // An empty selection yields undefined from attr() and '' from text(), so
    // these three values also tell us when an element is missing.
    var rawSrc = eImg.attr('src');
    var date = eDate.text().trim();
    var heading = eTitle.text().trim();
    if (!rawSrc || !date || !heading) {
      console.log('Invalid eImg, eDate or eTitle');
      console.log(util.inspect({
        'src': rawSrc,
        'date': date,
        'title': heading
      }, {depth: null}));
      return;
    }

    var eAlt = $('div.list_b div.ovh td').first();
    // Prefer the document <title>; fall back to the on-page heading.
    var title = $('title').text().trim() || heading;
    var src = formatUrl(rawSrc);
    // The image's own alt text is tried before the page title; with the title
    // first, the alt attribute could never be reached.
    var alt = eAlt.text().trim() || eImg.attr('alt') || title;
    if (!title || !date || !isFullUrl(src)) {
      console.log('Invalid title, date or src');
      console.dir({
        'title': title,
        'date': date,
        'src': src
      });
      return;
    }

    var img = {
      'src': src,
      'alt': alt
    };
    var obj = {
      'title': title,
      'date': date,
      'created': new Date(),
      'tags': ['gif'],
      'imgs': [img],
      'link': url
    };
    console.dir(obj);
  });
}
/**
 * Fetch one listing page, schedule a crawl of every entry on it, then follow
 * the "next page" link if there is one.
 *
 * Each entry link is passed to crawlDetail() through setTimeout with a
 * 1000 ms delay; the next listing page is scheduled the same way. Note that
 * all entries of a page share the same delay, so their requests are issued
 * together rather than one second apart.
 *
 * @param {string} url - Absolute URL of the listing page.
 * @returns {undefined} Progress is reported through the console only.
 */
function crawlPager(url) {
  // `options` is shared module state; only its url changes per request.
  options.url = url;
  request(options, function(err, res, body) {
    console.log(util.format('Current pager: %s', url));
    if (err || res.statusCode !== 200) {
      // Log the error and status only; inspecting the whole response object
      // at unlimited depth floods the console.
      console.log(util.inspect({
        'err': err,
        'statusCode': res ? res.statusCode : undefined
      }, {depth: null}));
      return;
    }

    var $ = cheerio.load(gbkDecode(body));
    $('div.list_b h2.title').each(function() {
      // The last anchor inside the heading carries the detail-page link.
      var href = $(this).find('a').last().attr('href');
      var link = formatUrl(href);
      if (isFullUrl(link)) {
        setTimeout(crawlDetail, 1000, link);
      } else {
        console.log('Invalid link:' + link);
      }
    });

    var nextUrl = '';
    $('ul.pagelist a').each(function() {
      var href = $(this).attr('href');
      // '下一页' is the "next page" label. An anchor without href is skipped,
      // otherwise "undefined" would be formatted into a crawlable-looking URL.
      if ($(this).text() !== '下一页' || !href) {
        return;
      }
      // The front page's pager href is joined to the site root; on every
      // other listing page it is joined to "/index/".
      if (url === baseUrl) {
        nextUrl = util.format('%s/%s', baseUrl, href);
      } else {
        nextUrl = util.format('%s/index/%s', baseUrl, href);
      }
    });

    if (isFullUrl(nextUrl)) {
      setTimeout(crawlPager, 1000, nextUrl);
    } else {
      console.log(util.format('Last pager: %s', url));
    }
  });
}
/**
 * Entry point: begin crawling at the site's root listing page.
 */
function main() {
  var startUrl = baseUrl;
  crawlPager(startUrl);
}

// Start the crawl as soon as the script is loaded.
main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment