Skip to content

Instantly share code, notes, and snippets.

@wildermuthn
Created June 10, 2014 22:17
Show Gist options
  • Save wildermuthn/6470c15163a1a102032a to your computer and use it in GitHub Desktop.
Save wildermuthn/6470c15163a1a102032a to your computer and use it in GitHub Desktop.
scrape
var request = require('request')
, cheerio = require('cheerio')
, async = require('async')
, format = require('util').format;
// Subreddit names to scrape; each one is substituted into the listing URL.
var reddits = ['ImagesOfSpace'];
// Limit handed to async.eachLimit when iterating over `reddits`.
var concurrency = 2;
/**
 * For every subreddit in `reddits` (at most `concurrency` at a time):
 *   1. request up to MAX_PAGES listing pages,
 *   2. collect the `a.title` hrefs that mention "imgur" but not "jpg",
 *   3. request each collected link and record the `src` of every
 *      `div.image a img` element found on that page,
 *   4. print the resulting { link: [imageUrl, ...] } map and signal
 *      completion to async through `next`.
 *
 * A failed listing request aborts the subreddit and is reported through
 * `next(err)`; a failed image-page request is logged and skipped so one bad
 * link does not discard everything else that was collected.
 */
async.eachLimit(reddits, concurrency, function (reddit, next) {
  var MAX_PAGES = 4;        // listing pages requested per subreddit
  var links = {};           // link href -> array of image URLs found on that page
  var pagesFetched = 0;
  var count = 0;            // running number of collected links (used for logging)
  var pending = 0;          // image-page requests still in flight
  var listingDone = false;  // true once no further listing page will be requested
  var finished = false;     // guards against calling `next` more than once

  // Report the result for this subreddit exactly once.
  function finish(err) {
    if (finished) return;
    finished = true;
    if (!err) console.log(links);
    next(err);
  }

  // Complete only when pagination has stopped AND every image request returned,
  // otherwise `links` would be printed with still-empty arrays.
  function maybeFinish() {
    if (listingDone && pending === 0) finish();
  }

  // Fetch one collected link and append the image sources found on it.
  function getImages(link) {
    pending++;
    request(link, function (err, response, body) {
      console.log('LINK: %s', link);
      if (err) {
        // Best effort: keep the (empty) entry and carry on with the other links.
        console.error('Failed to fetch %s: %s', link, err.message);
      } else {
        var $ = cheerio.load(body || '');
        $('div.image a img').each(function () {
          var url = $(this).attr('src');
          if (url) links[link].push(url);
        });
      }
      pending--;
      maybeFinish();
    });
  }

  // Fetch one listing page; `after` is the data-fullname of the last entry of
  // the previous page (presumably the listing's pagination cursor -- confirm
  // against the site's markup). It is omitted for the first page.
  function getLinks(after) {
    pagesFetched++;
    var url = format('http://reddit.com/r/%s?limit=100', reddit);
    if (after) url += '&after=' + encodeURIComponent(after);

    request(url, function (err, response, body) {
      if (err) return finish(err);

      var $ = cheerio.load(body || '');
      var fullname = '';

      $('div.thing').each(function () {
        var href = $(this).find('a.title').attr('href');
        fullname = $(this).attr('data-fullname');

        // Entries without a title link cannot be followed.
        if (!href) return;
        // Only imgur links that are not direct jpg files are followed.
        if (href.indexOf('imgur') === -1 || href.indexOf('jpg') !== -1) return;
        // A link seen on an earlier page is already stored / being fetched.
        if (Object.prototype.hasOwnProperty.call(links, href)) return;

        links[href] = [];
        count++;
        console.log('%s: %s', count, href);
        getImages(href);
      });

      // Stop at the page limit, or when the page gave no cursor to continue from.
      if (pagesFetched < MAX_PAGES && fullname) {
        getLinks(fullname);
      } else {
        listingDone = true;
        maybeFinish();
      }
    });
  }

  getLinks();
}, function (err) {
  if (err) {
    console.error('Scrape failed: %s', err.message || err);
    process.exitCode = 1;
  }
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment