Skip to content

Instantly share code, notes, and snippets.

@neolee
Created May 17, 2013 13:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save neolee/5599115 to your computer and use it in GitHub Desktop.
Save neolee/5599115 to your computer and use it in GitHub Desktop.
Using cheerio and request to scrape clojure.github.io/clojure
function runClojureOrg(redis) {
var request = require('request');
var cheerio = require('cheerio');
var urlClojureAPI = 'http://clojure.github.io/clojure/';
console.log('Start scrapping index: ' + urlClojureAPI);
redis.flushdb();
request(urlClojureAPI, function(err, resp, body) {
if (err)
throw err;
var $ = cheerio.load(body);
var namespaces = $('#namespace-entry h2');
$(namespaces).each(function(i, namespace) {
var link = $('#api-link', $(namespace));
var ns = $(namespace).text();
var url = urlClojureAPI + $(link).attr('href');
redis.zadd('namespaces', i, ns);
console.log('Start scrapping ' + ns + ': ' + url);
request(url, (function(url) {
return function(err, resp, body) {
if (err)
throw err;
var $ = cheerio.load(body);
var funcs = $('#var-entry');
$(funcs).each(function(i, func) {
// Namespace and name
var elemH2 = $(func).children('h2');
var funcId = $(elemH2).attr('id');
var sep = funcId.indexOf('/');
var ns = funcId.substring(0, sep);
var name = funcId.substring(sep+1, funcId.length);
// Usage (remove the leading 'Usage: ' string)
var elemUsage = $(func).children('#var-usage');
var usage = $(elemUsage).text();
usage = usage.substring(7, usage.length).trim().replace(/\n\s{2,}/g, '\n');
// Doc
var elemDoc = $(func).children('#var-docstr');
var doc = $(elemDoc).text().replace(/\n\n/g, '<newpara>').replace(/\n/g, ' ').replace(/\n\s{2,}/g, '\n').replace(/<newpara>/g, '\n\n');
redis.hmset(ns + ':' + name, 'name', name, 'ns', ns, 'url', url, 'usage', usage, 'doc', doc);
});
console.log('Succeed scrapped ' + url);
};
})(url));
});
console.log('Succeed scrapped ' + urlClojureAPI);
});
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment