Skip to content

Instantly share code, notes, and snippets.

@mrsinguyen
Created June 16, 2011 06:38
Show Gist options
  • Save mrsinguyen/1028786 to your computer and use it in GitHub Desktop.
Save mrsinguyen/1028786 to your computer and use it in GitHub Desktop.
GitHub search scraping
/*
Usage: io github <query> [language]
io github django
io github coffeescript
To limit search results to a certain language:
io github django python
To see debug info:
io --debug github django
*/
//The node.io module, plus state shared between the job's init() and run():
//  search_url       - base search URL built in init(); a page number is appended to it
//  added_additional - set to true once run() has queued the remaining result pages
var nodeio = require('node.io'), search_url, added_additional = false;
exports.job = new nodeio.Job({max: 50, retries: 3, auto_retry: true}, {
init: function () {
var query = '', language = '';
//Parse command line args
switch (this.options.args.length) {
case 0:
console.log('node.io github <query> [language]');
process.exit();
case 2: language = this.options.args[1];
case 1: query = this.options.args[0];
}
//Build the base search URL
search_url = 'https://github.com/search?type=Repositories&language='
+ language + '&q=' + query
+ '&repo=&langOverride=&x=0&y=0&start_value=';
//The initial input is page 1 of search results
this.input = [search_url + 1];
},
run: function (search_page) {
this.getHtml(search_page, function(err, $) {
//Add additional pages of search results to the input queue (only once)
if (!added_additional) {
var page, total_pages = $('.pager_link').last().text;
for (page = 2; page < total_pages; page++) {
this.add(search_url + page);
}
added_additional = true;
}
//Scrape projects on the page and emit
var projects = [];
$('.result').each(function (listing) {
var project = {}, title, language;
title = $('h2 a', listing).fulltext;
language = $('.language', listing).fulltext;
project.author = title.substring(0, title.indexOf(" / "));
project.title = title.substring(title.indexOf(" / ") + 3);
project.link = "https://github.com" + $('h2 a', listing).attribs.href;
project.language = language.substring(1, language.length - 1);
project.description = $('.description', listing).fulltext;
projects.push(project);
});
this.emit(projects);
});
}
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment