Skip to content

Instantly share code, notes, and snippets.

@nulltask
Last active August 15, 2018 13:27
Show Gist options
  • Save nulltask/5696023 to your computer and use it in GitHub Desktop.
Save nulltask/5696023 to your computer and use it in GitHub Desktop.
mixcloud crawler
/**
* Module dependencies.
*/
var pkg = require('./package');
var debug = require('debug')('crawler');
var monk = require('monk');
var db = monk('localhost/mixcloud');
var cloudcasts = db.get('cloudcast');
var program = require('commander');
var Batch = require('batch');
var request = require('request');
// Running total of stored documents; incremented by the batch progress
// handler inside get() and shown in its debug output.
var count = 0;

// Command-line interface. The version is read from package.json (already
// loaded as `pkg`) so it cannot drift from the published package version.
program
  .version(pkg.version)
  .option('-u --url [url]', 'mixcloud api url')
  .parse(process.argv);
// API Documentation: http://www.mixcloud.com/developers/documentation/
/**
 * Fetch one page from `url`, insert every item of its `data` array into the
 * `cloudcasts` collection, then schedule the fetch of the next page.
 *
 * - Transport errors and responses whose body has no `data` array are
 *   logged and retried on the same URL after 10 seconds.
 * - Inserts run through a Batch with a concurrency of 4.
 * - The next page (`paging.next`) is requested after a random 100-199ms
 *   pause; when the response has no `paging.next` the crawl stops.
 *
 * @param {String} url page URL to fetch
 */
function get(url) {
  debug('get', url);
  var stream = request(url, { json: true }, function(err, res) {
    var body = res && res.body;

    // A reply without a `data` array (error page, non-JSON body, ...) is
    // handled like a transport error instead of throwing a TypeError below.
    if (!err && !(body && Array.isArray(body.data))) {
      err = new Error('unexpected response (status ' + (res && res.statusCode) + ')');
    }
    if (err) {
      debug('get:err', err);
      return setTimeout(function() { get(url); }, 10000);
    }

    var batch = new Batch();
    // `paging` or `paging.next` may be absent; that marks the last page.
    var next = body.paging && body.paging.next;
    var wait = (100 + Math.random() * 100) | 0;

    batch.concurrency(4);
    batch.on('progress', function(e) {
      debug('progress [%s] (%s/%s)', ++count, e.complete, e.total, e.value && e.value.key);
    });

    body.data.forEach(function(data) {
      batch.push(function(done) {
        cloudcasts
          .insert(data)
          .on('complete', done);
      });
    });

    batch.end(function(batchErr) {
      // Keep crawling on insert failures (as before), but no longer silently.
      if (batchErr) {
        debug('insert:err', batchErr);
      }
      // Without a next URL there is nothing left to request.
      if (!next) {
        return debug('done');
      }
      debug('wait %sms', wait);
      setTimeout(function() { get(next); }, wait);
    });
  });

  // Console progress indicator: a space per request, a dot per received chunk.
  process.stdout.write(' ');
  stream.on('data', function() {
    process.stdout.write('.');
  });
}
// Start crawling at the URL passed with -u/--url, or at the default endpoint.
var startUrl = program.url || 'http://api.mixcloud.com/new/';
get(startUrl);
{
"name": "mixcloud-crawler",
"version": "0.0.0",
"dependencies": {
"request": "~2.21.0",
"monk": "~0.7.1",
"debug": "~0.7.2",
"commander": "~1.1.1",
"batch": "~0.3.2"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment