Skip to content

Instantly share code, notes, and snippets.

@ncla
Created September 30, 2016 18:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ncla/bc17e6c6e6efe09d0dd8149015eed426 to your computer and use it in GitHub Desktop.
Save ncla/bc17e6c6e6efe09d0dd8149015eed426 to your computer and use it in GitHub Desktop.
Muse.mu image scraper
var request = require('request');
var cheerio = require('cheerio');
var url = require('url');
var async = require('async');
var _ = require('underscore');
var fs = require('fs');
/**
 * Extracts the album URLs from a gallery index page.
 *
 * Looks at every `.photoList li` entry and takes the `href` of its
 * `.thumb > a` link, prefixed with the site origin.
 *
 * @param {string} htmlString - Raw HTML of a gallery index page.
 * @returns {string[]} Absolute album URLs, in document order.
 */
function parseGalleriesList(htmlString) {
    // Declared locally: a bare `$ = ...` creates an implicit global that every
    // parser would share (and is a ReferenceError in strict mode).
    var $ = cheerio.load(htmlString);
    var albumUrls = [];

    $('.photoList li').each(function (i, v) {
        var href = $(v).find('.thumb > a').attr('href');
        // Entries without a thumbnail link would otherwise produce the bogus
        // URL "http://muse.muundefined", so they are skipped.
        if (href) {
            albumUrls.push('http://muse.mu' + href);
        }
    });

    return albumUrls;
}
/**
 * Parses one page of an album: the image-page links it lists and the link to
 * the next page of the same album, if there is one.
 *
 * @param {string} htmlString - Raw HTML of an album page.
 * @returns {{imageUrls: string[], isLastPage: boolean, nextPageUrl: (string|boolean)}}
 *   `imageUrls` holds absolute URLs of the individual image pages;
 *   `isLastPage` is true when no "Next page" link is present;
 *   `nextPageUrl` is the absolute URL of the next page, or `false` if none.
 */
function parseGalleriesImageList(htmlString) {
    // Declared locally: a bare `$ = ...` creates an implicit global that every
    // parser would share (and is a ReferenceError in strict mode).
    var $ = cheerio.load(htmlString);
    var imageUrls = [];

    $('.photoList li').each(function (i, v) {
        var href = $(v).find('.thumb > a').attr('href');
        // Entries without a thumbnail link would otherwise produce the bogus
        // URL "http://muse.muundefined", so they are skipped.
        if (href) {
            imageUrls.push('http://muse.mu' + href);
        }
    });

    // The selector requires an href attribute, so a match always has one.
    var nextPageLink = $('a[href][title="Next page"]').eq(0);
    var hasNextPage = nextPageLink.length > 0;

    return {
        'imageUrls': imageUrls,
        'isLastPage': !hasNextPage,
        'nextPageUrl': hasNextPage ? 'http://muse.mu' + nextPageLink.attr('href') : false
    };
}
/**
 * Extracts the source URL of the main image from a single image page.
 *
 * @param {string} htmlString - Raw HTML of an image page.
 * @returns {(string|undefined)} The `src` of the first `.mainImg img`
 *   element, or `undefined` when the page has no such element.
 */
function parseImagePage(htmlString) {
    // Declared locally: a bare `$ = ...` creates an implicit global that every
    // parser would share (and is a ReferenceError in strict mode).
    var $ = cheerio.load(htmlString);
    return $('.mainImg img').attr('src');
}
// Running statistics, updated by the queue worker and printed once the queue drains.

// Sum of the number of image-page links found across all parsed album pages.
var totalImages = 0;
// Number of album pages ('imageList' tasks) that have been parsed.
var totalImageList = 0;
// Number of follow-up album pages queued because a "Next page" link was found.
var totalImageListExtra = 0;
// Number of album pages that listed exactly 12 images.
var totalImageListFull = 0;
// Number of album pages that listed fewer than 3 images (includes empty pages).
var totalImageListLessThanThree = 0;
// Number of album pages that listed no images at all.
var totalImageListNone = 0;
// De-duplicated list of every image-page URL discovered so far.
var imagePages = [];
// Image downloads are written into ./images; create it up front so the first
// write stream does not fail with ENOENT.
if (!fs.existsSync('images')) {
    fs.mkdirSync('images');
}

/**
 * Work queue that drives the whole scrape, running up to 50 tasks at once.
 *
 * Each task is `{type, url}` where `type` is one of:
 *   - 'galleryList': gallery index page; queues an 'imageList' task per album.
 *   - 'imageList':   album page; queues an 'imagePage' task per image and an
 *                    'imageList' task for the next page, and updates the counters.
 *   - 'imagePage':   single image page; queues an 'image' task for the image file.
 *   - 'image':       image file; streamed to images/<last path segment of url>.
 *
 * Every load is attempted up to 5 times, 500 ms apart. If a task still fails,
 * the queue is killed and the scrape stops.
 */
var q = async.queue(function (scrapeInfo, queueCallback) {
    async.retry({times: 5, interval: 500}, function loadPage(retryCallback) {
        if (scrapeInfo.type !== 'image') {
            request(scrapeInfo.url, {}, function (error, response) {
                if (error) {
                    // Return here: falling through would invoke the retry
                    // callback a second time with a success result.
                    return retryCallback(error);
                }
                retryCallback(null, {reqResponse: response, details: scrapeInfo});
            });
            return;
        }

        // Image download. Several events can end it (request error, write
        // error, stream close), so make sure the retry callback fires once.
        var finished = false;
        function finish(err) {
            if (finished) {
                return;
            }
            finished = true;
            if (err) {
                retryCallback(err);
            } else {
                retryCallback(null, {details: scrapeInfo});
            }
        }

        var target = fs.createWriteStream('images/' + scrapeInfo.url.split('/').pop());
        target.on('error', finish);
        target.on('close', function () {
            finish(null);
        });

        request(scrapeInfo.url)
            .on('error', function (err) {
                finish(err);
                // pipe() does not end the destination when the source fails,
                // so close the file explicitly.
                target.end();
            })
            .pipe(target);
    }, function (err, result) {
        if (err) {
            q.kill();
            console.log("An error occoured while loading page #" + scrapeInfo.url + ", aborting...");
            console.log(err);
            return;
        }

        if (result.details.type === 'image') {
            queueCallback();
            return;
        }

        console.log(result.details);
        console.log(result.reqResponse.request.uri.href + ' => ' + result.reqResponse.statusCode);

        if (result.details.type === 'galleryList') {
            var albumUrls = parseGalleriesList(result.reqResponse.body);

            _.each(albumUrls, function (albumUrl) {
                q.push({
                    'type': 'imageList',
                    'url': albumUrl
                });
            });
        }

        if (result.details.type === 'imageList') {
            var parsed = parseGalleriesImageList(result.reqResponse.body);

            totalImageList++;

            if (parsed.nextPageUrl !== false) {
                q.push({
                    'type': 'imageList',
                    'url': parsed.nextPageUrl
                });
                totalImageListExtra++;
            }

            _.each(parsed.imageUrls, function (imagePageUrl) {
                q.push({
                    'type': 'imagePage',
                    'url': imagePageUrl
                });
            });

            imagePages = _.union(imagePages, parsed.imageUrls);
            totalImages = totalImages + parsed.imageUrls.length;

            if (parsed.imageUrls.length === 12) {
                totalImageListFull++;
            }
            if (parsed.imageUrls.length < 3) {
                totalImageListLessThanThree++;
            }
            if (parsed.imageUrls.length === 0) {
                totalImageListNone++;
            }
        }

        if (result.details.type === 'imagePage') {
            var imageSrc = parseImagePage(result.reqResponse.body);

            if (imageSrc) {
                q.push({
                    'type': 'image',
                    'url': imageSrc
                });
            } else {
                // Without this guard an 'image' task with an undefined url is
                // queued and the worker throws on scrapeInfo.url.split().
                console.log('No main image found on ' + result.details.url + ', skipping');
            }
        }

        queueCallback();
    });
}, 50);
// Once every queued task has finished, print the collected statistics
// followed by the number of distinct image pages that were discovered.
q.drain = function reportTotals() {
    var summary = [
        'Total image count ', totalImages,
        ', Total image lists ', totalImageList,
        ', Total image lists extra ', totalImageListExtra,
        ' Full pages ', totalImageListFull,
        ' Less than 3 ', totalImageListLessThanThree,
        ' Zero ', totalImageListNone
    ].join('');

    console.log(summary);
    console.log(imagePages.length);
};
// Seed the queue with the gallery index pages, pg=1 through pg=13.
_.range(1, 14).forEach(function (pageNumber) {
    var galleryPageUrl = 'http://muse.mu/images.htm?pg=' + pageNumber;

    q.push({
        'type': 'galleryList',
        'url': galleryPageUrl
    });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment