Last active
August 29, 2015 14:03
-
-
Save huzhifeng/b8f00637fe271eecb111 to your computer and use it in GitHub Desktop.
Node.js crawler for http://forgifs.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require('request'); | |
var cheerio = require('cheerio'); | |
var util = require('util'); | |
var baseUrl = 'http://forgifs.com'; | |
/**
 * Turn an image `src` value scraped from a page into an absolute URL.
 *
 * - Absolute `http://` / `https://` URLs are returned unchanged.
 * - Protocol-relative URLs (`//host/path`) receive the scheme of `baseUrl`.
 * - Root-relative paths (`/path`) are appended directly to `baseUrl`.
 * - Any other value is treated as a relative path and joined with a `/`.
 *
 * @param {string} src - Raw `src` attribute value.
 * @returns {string} Absolute URL for the resource.
 */
function formatSrc(src) {
  // The scheme test is anchored at the start and accepts https as well, so
  // an https URL is not mistaken for a relative path and a relative path
  // that merely contains "http://" (e.g. in a query string) still gets
  // resolved against baseUrl.
  if (/^https?:\/\//i.test(src)) {
    return src;
  }
  // Protocol-relative reference: reuse the scheme part of baseUrl ("http:").
  if (src.indexOf('//') === 0) {
    return util.format('%s%s', baseUrl.split('//')[0], src);
  }
  if (src.charAt(0) === '/') {
    return util.format('%s%s', baseUrl, src);
  }
  return util.format('%s/%s', baseUrl, src);
}
/**
 * Strip the label from a "Label: value" date string.
 *
 * The input is split on ':'; when that yields exactly two pieces (for
 * example "Date: 03/31/2009") the trimmed second piece is returned.
 * Any other shape is returned untouched.
 *
 * @param {string} d - Text of the date element.
 * @returns {string} The value after the colon, or `d` itself.
 */
function formatDate(d) {
  var pieces = d.split(':');
  return pieces.length === 2 ? pieces[1].trim() : d;
}
/**
 * Work out the thumbnail URL that belongs to an original gif `src`.
 *
 * Example:
 *   Original gif src: /gallery/d/218815-1/Lizard-vs-centipede.gif
 *   Thumbnail src:    http://forgifs.com/gallery/d/218816-2/Lizard-vs-centipede.gif
 *
 * For a src of the form `/gallery/d/<a>-<b>/<name>.gif` the thumbnail path
 * is the same path with both numbers incremented by one. Sources that do
 * not match that form, or whose first or second number equals 9, are looked
 * up in a hard-coded exception table instead.
 *
 * @param {string} src - The `src` attribute of the original image.
 * @returns {string} Absolute thumbnail URL, or '' when none can be derived.
 */
function getThumbnailUrl(src) {
  var parsed = src.match(/^\/gallery\/d\/(\d{3,6})-(\d{1,2})\/.*\.gif$/i);
  var ruleless = {
    // Original src: Thumbnail src
    '/gallery/d/68120-5/Bowling-celebration-fail': '/gallery/d/68121-5/Bowling-celebration-fail',
    '/gallery/d/162692-9/Bert-snaps.gif': '/gallery/d/162693-4/Bert-snaps.gif',
    '/gallery/d/129655-9/News-reporter-videobombed_001.gif': '/gallery/d/129656-4/News-reporter-videobombed_001.gif',
    '/gallery/d/66944-9/Roller-coaster-fail-trapped.gif': '/gallery/d/66945-7/Roller-coaster-fail-trapped.gif',
    '/gallery/d/83077-9/Kid-kicking-ball-fail-QWOP.gif': '/gallery/d/83078-6/Kid-kicking-ball-fail-QWOP.gif',
    '/gallery/d/71404-9/Lizard-attacks-reporter.gif': '/gallery/d/71405-7/Lizard-attacks-reporter.gif',
    '/gallery/d/84325-9/Bowling-fail-sprinklers.gif': '/gallery/d/84326-6/Bowling-fail-sprinklers.gif',
    '/gallery/d/81522-9/Boxer-vs-bullies.gif': '/gallery/d/81523-8/Boxer-vs-bullies.gif',
    '/gallery/d/170176-9/Heaven-gates.gif': '/gallery/d/170177-4/Heaven-gates.gif',
    '/gallery/d/165044-9/Cthulhu-shirt-shake.gif': '/gallery/d/165045-10/Cthulhu-shirt-shake.gif',
    '/gallery/d/84702-9/Alba_finger_suck.gif': '/gallery/d/84703-4/Alba_finger_suck.gif',
    '/gallery/d/77026-9/Horse_slides.gif': '/gallery/d/77027-5/Horse_slides.gif',
    '/gallery/d/79735-9/Treadmill_gangsta.gif': '/gallery/d/79736-3/Treadmill_gangsta.gif'
  };

  // Resolve `src` through the exception table; on a miss, log the given
  // message format (with src substituted) and yield the empty string.
  function fromRuleless(missFormat) {
    if (!ruleless.hasOwnProperty(src)) {
      console.log(util.format(missFormat, src));
      return '';
    }
    console.log(util.format('Hit in ruleless[%s]: [%s]', src, ruleless[src]));
    return formatSrc(ruleless[src]);
  }

  if (!parsed || parsed.length !== 3) {
    return fromRuleless('Error, ignore unmatched src: %s');
  }

  var first = parseInt(parsed[1], 10);
  var second = parseInt(parsed[2], 10);
  // A 9 in either position does not follow the "+1" rule.
  if (first === 9 || second === 9) {
    return fromRuleless('Error, ignore ruleless src: %s');
  }

  var bumped = util.format('%d-%d', first + 1, second + 1);
  return formatSrc(src.replace(/(\d{3,6})-(\d{1,2})/i, bumped));
}
/**
 * Fetch one gif detail page and print the record extracted from it.
 *
 * The page must contain an image inside `div#gsImageView`, a
 * `.date.summary` element and an `h2` inside `div#gsContent`, each with
 * non-empty content; otherwise the elements are dumped to the console and
 * the page is skipped. On success an object with title, date, creation
 * time, tags and image info is logged.
 *
 * @param {string} url - Address of the detail page.
 */
function detailProc(url) {
  request(url, function onDetailResponse(err, res, body) {
    console.log(util.format('Current detail: %s', url));

    var failed = err || res.statusCode != 200;
    if (failed) {
      console.log(util.inspect({'err': err, 'res': res}, {depth: null}));
      return;
    }

    var $ = cheerio.load(body);
    var eImg = $('div#gsImageView img').first();
    var eDate = $('div#gsContent .date.summary').first();
    var eTitle = $('div#gsContent h2').first();

    // Each element must exist and carry non-empty content.
    var usable = (eImg.length === 1) &&
      (eDate.length === 1) &&
      (eTitle.length === 1) &&
      eImg.attr('src') &&
      eDate.text().trim() &&
      eTitle.text().trim();
    if (!usable) {
      console.log('Invalid eImg, eDate or eTitle');
      console.log(util.inspect({'eImg': eImg, 'eDate': eDate, 'eTitle': eTitle}, {depth: null}));
      return;
    }

    // Prefer the document <title>; fall back to the heading text.
    var title = $('title').text().trim() || eTitle.text().trim();
    var rawSrc = eImg.attr('src');
    var img = {
      'src': formatSrc(rawSrc),
      'alt': eImg.attr('alt') || title
    };

    // The thumbnail is optional: only attach it when one could be derived.
    var thumbnailUrl = getThumbnailUrl(rawSrc);
    if (thumbnailUrl) {
      img.thumbnail = thumbnailUrl;
    }

    var record = {
      'title': title,
      'date': formatDate(eDate.text().trim()),
      'created': new Date(),
      'tags': ['gif'],
      'imgs': [img]
    };
    console.log(util.inspect(record));
  });
}
/**
 * Fetch one gallery listing page, schedule a detail crawl for every item
 * on it, then schedule the next listing page if there is one.
 *
 * Requests are spaced one second apart: the n-th valid item on the page is
 * fetched n seconds after the listing was parsed, and the next listing page
 * is fetched one second after the last scheduled item. Using the same fixed
 * delay for every item would release all of them in a single burst.
 *
 * @param {string} url - Address of the listing page.
 */
function pageProc(url) {
  request(url, function(err, res, body) {
    console.log(util.format('Current page: %s', url));
    if (err || res.statusCode !== 200) {
      console.log(util.inspect({
        'err': err,
        'res': res
      }, {depth: null}));
      return;
    }

    var $ = cheerio.load(body);
    var delayMs = 1000;
    // Number of detail requests queued so far for this page; drives the
    // per-request delay so the requests do not all fire together.
    var scheduled = 0;

    $('div#gsContent td.giItemCell').each(function(index, itemCell) {
      var eA = $(itemCell).find('a').first();
      var href = eA.attr('href');
      if ((eA.length === 1) && href && (href.indexOf('v/') !== -1)) {
        var detailUrl = util.format('%s/gallery/%s', baseUrl, href);
        scheduled += 1;
        setTimeout(detailProc, scheduled * delayMs, detailUrl);
      } else {
        console.log('Invalid giItemCell');
        console.log(util.inspect(itemCell, {depth: null}));
      }
    });

    var eNext = $('div#gsContent div.next-and-last a.next').first();
    var nextHref = eNext.attr('href');
    if ((eNext.length === 1) &&
        nextHref &&
        (nextHref.indexOf('main.php?g2_page=') !== -1)) {
      var nextUrl = util.format('%s/gallery/%s', baseUrl, nextHref);
      // Move on only after this page's detail requests have been released.
      setTimeout(pageProc, (scheduled + 1) * delayMs, nextUrl);
    } else {
      console.log('Last page');
    }
  });
}
/**
 * Script entry point: start crawling listing pages at `baseUrl`.
 */
function main() {
  var startUrl = baseUrl;
  pageProc(startUrl);
}

main();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment