Skip to content

Instantly share code, notes, and snippets.

@rlemon
Last active November 13, 2015 21:37
Show Gist options
  • Save rlemon/9222f2a1bf8a55c98efc to your computer and use it in GitHub Desktop.
Save rlemon/9222f2a1bf8a55c98efc to your computer and use it in GitHub Desktop.
chat star scraper
/*
 * Star leaderboard.
 *
 * Reads a JSON array of starred-message rows from the file named by the first
 * command-line argument, sums the star counts per `userid`, and prints the ten
 * users with the highest totals (ascending, so the top user is printed last).
 *
 * Each row is expected to carry `userid` and `count` properties; other
 * properties are ignored here.
 */
const fs = require('fs');
const inputfile = process.argv[2];
if (!inputfile) {
  // Fail with a readable message instead of the TypeError readFileSync raises
  // for an undefined path.
  throw new Error('you need to pass an input file');
}
const data = JSON.parse(fs.readFileSync(inputfile, 'utf8'));

// Running star total per user, keyed by row.userid.
const out = {};
data.forEach(function(row) {
  if (!row.userid) return; // user was deleted, there is no link back to their account.
  // A missing or empty count is treated as a single star.
  const val = Number(row.count || 1);
  if (row.userid in out) {
    out[row.userid] += val;
  } else {
    out[row.userid] = val;
  }
});

const sorted = Object.keys(out).map(function(k) {
  return { id: k, val: out[k] };
}).sort(function(a, b) {
  // A sort comparator must return a negative, zero, or positive number.
  // Returning a boolean (`a.val > b.val`) coerces to 0/1 and can never signal
  // "a goes before b", so the resulting order is not reliable.
  return a.val - b.val;
});

// The list is ascending, so the last ten entries are the ten highest totals.
console.log(sorted.slice(-10));
// Third-party dependencies: HTTP client and HTML parser/selector engine.
const request = require('request');
const $ = require('cheerio');

// Command-line arguments: <roomid> <outputfile>.
const roomid = process.argv[2];
const outputfile = process.argv[3];

// State shared with scrape() and finish().
let found = []; // every starred-message record collected so far
let page = 1; // page counter, used only for progress logging
const start = Date.now(); // start timestamp for the elapsed-time log line
const siteRoot = 'http://chat.stackoverflow.com/rooms/info/' + roomid + '/';

// Both arguments are mandatory; bail out before any request is made.
if (!(roomid && outputfile)) {
  throw new Error('you need to pass a roomid and a output file');
}

console.log('scraping room#', roomid);
// Kick off the crawl at the first page of the room's "stars" tab.
scrape(siteRoot + '?tab=stars');
/**
 * Fetch one page of starred messages, append a record for every `.monologue`
 * element to the shared `found` list, then either follow the page's
 * `a[rel="next"]` link or call finish() when there is no such link.
 *
 * If the request fails (transport error or a non-200 response) the failure is
 * logged, the process exit code is set to 1, and finish() is still called so
 * the records collected so far are written out rather than lost.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 */
function scrape(url) {
  console.log('scraping page ', page);
  request(url, function(err, res, html) {
    // Without this check a failed request yields an empty document, which is
    // indistinguishable from "last page reached" and silently truncates output.
    if (err || !res || res.statusCode !== 200) {
      console.error('failed to fetch', url, err || ('HTTP ' + (res && res.statusCode)));
      process.exitCode = 1;
      finish();
      return;
    }

    var root = $(html);
    var newUrl = root.find('a[rel="next"]').attr('href');
    var matches = [];

    root.find('.monologue').each(function(i, row) {
      var entry = $(row);
      matches.push({
        id: entry.find('.message a').attr('name'),
        // Raw text of the star counter; NOTE(review): presumably empty when a
        // message has a single star - confirm against the page markup.
        count: entry.find('.flash .stars .times').text(),
        username: entry.find('.username').text(),
        // The link's href, not a bare numeric id; undefined when the
        // username element contains no link.
        userid: entry.find('.username a').attr('href')
      });
    });
    found = found.concat(matches);

    if (newUrl) {
      page++;
      console.log('total time: ', (Date.now() - start) / 1000, 'seconds');
      scrape(siteRoot + newUrl);
    } else {
      finish();
    }
  });
}
/**
 * Serialise the collected `found` records as JSON and write them to
 * `outputfile`.
 *
 * The success message is logged from the stream's 'finish' event, i.e. only
 * after the data has actually been flushed; a write error is logged and sets
 * the process exit code to 1.
 */
function finish() {
  var ws = require('fs').createWriteStream(outputfile);
  ws.on('error', function(err) {
    console.error(err);
    process.exitCode = 1;
  });
  // Logging right after end() would report success before the write happened,
  // and would do so even when the write later fails.
  ws.on('finish', function() {
    console.log('found stars written to ', outputfile);
  });
  ws.end(JSON.stringify(found));
}
@benjamingr
Copy link

const request = require("request-promise");
const Promise = require("bluebird");
const $ = require("cheerio");
const url = "http://chat.stackoverflow.com/rooms/info/17?tab=stars&page=";

// Integers from `from` to `to`, both inclusive.
// NOTE(review): the original sketch called an undefined `range`; inclusive
// bounds are an assumption - confirm whether page 476 should be fetched.
const range = (from, to) =>
    Array.from({ length: to - from + 1 }, (_, i) => from + i);

/**
 * Fetch page `n` of the starred-messages tab and resolve to one record per
 * `.monologue` element on that page.
 *
 * @param {number} n - Page number appended to `url`.
 * @returns {Promise<Array<{id, count, username, userid}>>}
 */
const scrape = n => request(url + n).then(html => {
    const root = $(html);
    // Wrap each element explicitly: passing `$` straight to Array#map would
    // also hand it the index and the array as extra arguments.
    return root.find(".monologue").get().map(el => {
        const row = $(el);
        const id = row.find('.message a').attr('name');
        const count = row.find('.flash .stars .times').text();
        const username = row.find('.username').text();
        const userid = row.find('.username a').attr('href');
        return { id: id, count: count, username: username, userid: userid };
    });
});

// Fetch every page with at most 16 requests in flight at a time.
Promise.map(range(1, 476), scrape, {concurrency: 16}).then(results => {
    // flatmap and do whatever
        // write to output
}).catch(err => {
    // Surface a failed page instead of leaving the rejection unhandled.
    console.error(err);
    process.exitCode = 1;
});

@rlemon
Copy link
Author

rlemon commented Nov 13, 2015

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment