Skip to content

Instantly share code, notes, and snippets.

@huzhifeng
Last active August 29, 2015 14:03
Show Gist options
  • Save huzhifeng/b8f00637fe271eecb111 to your computer and use it in GitHub Desktop.
Save huzhifeng/b8f00637fe271eecb111 to your computer and use it in GitHub Desktop.
Node.js crawler for http://forgifs.com
var request = require('request');
var cheerio = require('cheerio');
var util = require('util');
// Root URL of the crawled site: the crawl starts here, and relative paths
// scraped from pages are prefixed with it to build absolute URLs.
var baseUrl = 'http://forgifs.com';
/**
 * Turn a scraped src/href value into an absolute URL.
 *
 * - Absolute http:// or https:// URLs are returned unchanged.
 * - Protocol-relative URLs (//host/path) receive the scheme of baseUrl.
 * - Root-relative paths (/path) are appended directly to baseUrl.
 * - Anything else is treated as a path relative to the site root.
 *
 * @param {string} src - Attribute value taken from the page.
 * @returns {string} Absolute URL.
 */
function formatSrc(src) {
  // Anchor the scheme test at the start of the string. A plain
  // indexOf('http://') check misses https:// URLs and is fooled by
  // 'http://' appearing later (e.g. inside a query string).
  if (/^https?:\/\//i.test(src)) {
    return src;
  }
  // Protocol-relative URL: must be checked before the single-slash case,
  // otherwise it would be glued onto baseUrl as if it were a path.
  if (src.indexOf('//') === 0) {
    var scheme = baseUrl.substring(0, baseUrl.indexOf('//')); // e.g. 'http:'
    return util.format('%s%s', scheme, src);
  }
  if (src.charAt(0) === '/') {
    return util.format('%s%s', baseUrl, src);
  }
  return util.format('%s/%s', baseUrl, src);
}
/**
 * Strip the label from a "Label: value" date string.
 *
 * Example: 'Date: 03/31/2009' -> '03/31/2009'.
 * A string that does not contain exactly one colon is returned unchanged.
 *
 * @param {string} d - Raw date text taken from the page.
 * @returns {string} The trimmed text after the colon, or d itself.
 */
function formatDate(d) {
  var pieces = d.split(':');
  if (pieces.length !== 2) {
    return d;
  }
  return pieces[1].trim();
}
/**
 * Derive the thumbnail URL that belongs to a full-size gif path.
 *
 * The general rule is that both numbers in the path are one higher for the
 * thumbnail than for the original, e.g.
 *   original:  /gallery/d/218815-1/Lizard-vs-centipede.gif
 *   thumbnail: http://forgifs.com/gallery/d/218816-2/Lizard-vs-centipede.gif
 * Paths that do not follow the rule are resolved through a fixed table.
 *
 * @param {string} src - Root-relative path of the original image.
 * @returns {string} Thumbnail URL (passed through formatSrc), or '' when
 *   no thumbnail can be determined.
 */
function getThumbnailUrl(src) {
  // Known exceptions to the "+1 / +1" rule: original src -> thumbnail src.
  var exceptions = {
    '/gallery/d/68120-5/Bowling-celebration-fail': '/gallery/d/68121-5/Bowling-celebration-fail',
    '/gallery/d/162692-9/Bert-snaps.gif': '/gallery/d/162693-4/Bert-snaps.gif',
    '/gallery/d/129655-9/News-reporter-videobombed_001.gif': '/gallery/d/129656-4/News-reporter-videobombed_001.gif',
    '/gallery/d/66944-9/Roller-coaster-fail-trapped.gif': '/gallery/d/66945-7/Roller-coaster-fail-trapped.gif',
    '/gallery/d/83077-9/Kid-kicking-ball-fail-QWOP.gif': '/gallery/d/83078-6/Kid-kicking-ball-fail-QWOP.gif',
    '/gallery/d/71404-9/Lizard-attacks-reporter.gif': '/gallery/d/71405-7/Lizard-attacks-reporter.gif',
    '/gallery/d/84325-9/Bowling-fail-sprinklers.gif': '/gallery/d/84326-6/Bowling-fail-sprinklers.gif',
    '/gallery/d/81522-9/Boxer-vs-bullies.gif': '/gallery/d/81523-8/Boxer-vs-bullies.gif',
    '/gallery/d/170176-9/Heaven-gates.gif': '/gallery/d/170177-4/Heaven-gates.gif',
    '/gallery/d/165044-9/Cthulhu-shirt-shake.gif': '/gallery/d/165045-10/Cthulhu-shirt-shake.gif',
    '/gallery/d/84702-9/Alba_finger_suck.gif': '/gallery/d/84703-4/Alba_finger_suck.gif',
    '/gallery/d/77026-9/Horse_slides.gif': '/gallery/d/77027-5/Horse_slides.gif',
    '/gallery/d/79735-9/Treadmill_gangsta.gif': '/gallery/d/79736-3/Treadmill_gangsta.gif'
  };

  // Resolve src through the exception table; on a miss, log the supplied
  // message format and give up with an empty string.
  function fromExceptions(missFormat) {
    if (!exceptions.hasOwnProperty(src)) {
      console.log(util.format(missFormat, src));
      return '';
    }
    console.log(util.format('Hit in ruleless[%s]: [%s]', src, exceptions[src]));
    return formatSrc(exceptions[src]);
  }

  var found = src.match(/^\/gallery\/d\/(\d{3,6})-(\d{1,2})\/.*\.gif$/i);
  if (!found || (found.length !== 3)) {
    return fromExceptions('Error, ignore unmatched src: %s');
  }

  var first = parseInt(found[1], 10);
  var second = parseInt(found[2], 10);
  // A 9 in either position is never incremented; only the table is trusted
  // (presumably the +1 rule does not hold for those paths).
  if (first === 9 || second === 9) {
    return fromExceptions('Error, ignore ruleless src: %s');
  }

  var bumped = util.format('%d-%d', (first + 1), (second + 1));
  return formatSrc(src.replace(/(\d{3,6})-(\d{1,2})/i, bumped));
}
/**
 * Fetch a single gif detail page, scrape it and log the resulting record.
 *
 * The record holds the page title, the date text, a creation timestamp,
 * a fixed 'gif' tag and one image entry (src, alt and, when it can be
 * derived, thumbnail). Request failures and pages missing the image, date
 * or heading are logged and skipped.
 *
 * @param {string} url - Absolute URL of the detail page.
 */
function detailProc(url) {
  function onDetailPage(err, res, body) {
    console.log(util.format('Current detail: %s', url));

    var failed = err || res.statusCode != 200;
    if (failed) {
      var failure = {
        'err': err,
        'res': res
      };
      console.log(util.inspect(failure, {depth: null}));
      return;
    }

    var $ = cheerio.load(body);
    var imgNode = $('div#gsImageView img').first();
    var dateNode = $('div#gsContent .date.summary').first();
    var headingNode = $('div#gsContent h2').first();

    // Each of the three elements must be present and carry non-empty content.
    var isComplete = (imgNode.length === 1) &&
      (dateNode.length === 1) &&
      (headingNode.length === 1) &&
      Boolean(imgNode.attr('src')) &&
      Boolean(dateNode.text().trim()) &&
      Boolean(headingNode.text().trim());
    if (!isComplete) {
      console.log('Invalid eImg, eDate or eTitle');
      var rejected = {
        'eImg': imgNode,
        'eDate': dateNode,
        'eTitle': headingNode
      };
      console.log(util.inspect(rejected, {depth: null}));
      return;
    }

    // Prefer the document <title>; fall back to the content heading.
    var title = $('title').text().trim() || headingNode.text().trim();

    var img = {
      'src': formatSrc(imgNode.attr('src')),
      'alt': imgNode.attr('alt') || title
    };
    var thumbnailUrl = getThumbnailUrl(imgNode.attr('src'));
    if (thumbnailUrl) {
      img.thumbnail = thumbnailUrl;
    }

    var record = {
      'title': title,
      'date': formatDate(dateNode.text().trim()),
      'created': new Date(),
      'tags': ['gif'],
      'imgs': [img]
    };
    console.log(util.inspect(record));
  }

  request(url, onDetailPage);
}
/**
 * Fetch one gallery listing page, schedule a detail fetch for every item
 * on it, then schedule the following listing page if a "next" link exists.
 *
 * Every follow-up request is scheduled with a one second timer.
 *
 * @param {string} url - Absolute URL of the listing page.
 */
function pageProc(url) {
  function onListingPage(err, res, body) {
    console.log(util.format('Current page: %s', url));

    var failed = err || res.statusCode != 200;
    if (failed) {
      var failure = {
        'err': err,
        'res': res
      };
      console.log(util.inspect(failure, {depth: null}));
      return;
    }

    var $ = cheerio.load(body);

    $('div#gsContent td.giItemCell').each(function(index, itemCell) {
      var link = $(itemCell).find('a').first();
      var href = (link.length === 1) ? link.attr('href') : undefined;
      // Only hrefs containing 'v/' are accepted as detail pages.
      if (!href || href.indexOf('v/') === -1) {
        console.log('Invalid giItemCell');
        console.log(util.inspect(itemCell, {depth: null}));
        return;
      }
      var detailUrl = util.format('%s/gallery/%s', baseUrl, href);
      setTimeout(detailProc, 1000, detailUrl);
    });

    var nextLink = $('div#gsContent div.next-and-last a.next').first();
    var nextHref = (nextLink.length === 1) ? nextLink.attr('href') : undefined;
    if (!nextHref || nextHref.indexOf('main.php?g2_page=') === -1) {
      console.log('Last page');
      return;
    }
    var nextUrl = util.format('%s/gallery/%s', baseUrl, nextHref);
    setTimeout(pageProc, 1000, nextUrl);
  }

  request(url, onListingPage);
}
/**
 * Script entry point: start crawling from the site root.
 */
function main() {
  var startUrl = baseUrl;
  pageProc(startUrl);
}

main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment