Skip to content

Instantly share code, notes, and snippets.

@neolee
Created May 16, 2013 14:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save neolee/5592124 to your computer and use it in GitHub Desktop.
Save neolee/5592124 to your computer and use it in GitHub Desktop.
/**
 * Scrape the function index at http://clojuredocs.org/clojure_core and record
 * every listed function in Redis.
 *
 * The current Redis database is flushed first. Then, for each function link on
 * the index page, the function's own page is fetched and two things are written
 * in a single MULTI transaction:
 *   - a hash 'func:<name>' with the fields name / ns / url / usage / doc
 *   - a member <name> in the sorted set 'indices', scored by the link's
 *     position on the index page
 *
 * All work is asynchronous and fire-and-forget: nothing is returned and there
 * is no completion callback.
 *
 * @param {Object} redis - Redis client; the code calls flushdb() and multi()
 *     (hmset / zadd / exec) on it.
 * @throws {Error} (asynchronously) if the index page request fails or jsdom
 *     cannot be initialised. Failures on individual function pages are logged
 *     and that function is skipped.
 */
function runClojureDocsOrg(redis) {
  var assert = require('better-assert');
  var request = require('request');
  var jsdom = require('jsdom');

  // URL for web scraping
  var siteUrl = 'http://clojuredocs.org';
  var category = 'clojure_core';
  var pageUrl = siteUrl + '/' + category;
  console.log('Start scrapping ' + pageUrl);

  // A single jsdom window is created lazily on the first page and then reused:
  // later pages just replace the document markup instead of building a new
  // window (and reloading jQuery) for every request.
  var sharedWindow;
  var scrapper = function(html, callback) {
    if (!sharedWindow) {
      var envJQuery = {html: html, scripts: ['./lib/jquery-1.9.1.min.js']};
      jsdom.env(envJQuery, function(err, window) {
        if (err) throw new Error('Failed to init jsdom');
        sharedWindow = window;
        callback(window.jQuery);
      });
    }
    else {
      // NOTE(review): relies on the jsdom build in use accepting an
      // innerHTML assignment on the document itself; confirm when upgrading.
      sharedWindow.document.innerHTML = html;
      callback(sharedWindow.jQuery);
    }
  };

  redis.flushdb();

  // Scraping and recording data in Redis
  request(pageUrl, function(err, resp, body) {
    if (err)
      throw err;

    scrapper(body, function($) {
      $('.functions_list .functions .function a').each(function(index) {
        var name = $(this).text();
        var href = $(this).attr('href');

        // An anchor without href cannot be followed; skip it rather than
        // crash the whole loop on href.split().
        if (!href) {
          console.log((name || '(unnamed)') + ': no href, dropped');
          return true;
        }

        // href is expected to look like '/<category>/<ns>/<key>'
        var arr = href.split("/");

        // Some non-standard keys needs special operation and assertion, see http://clojuredocs.org/clojure_core
        if (!name) { name = arr[3]; }
        if (['..'].indexOf(name) !== -1) { console.log(name +': null data, dropped'); return true; }

        var lowerName = name.toLowerCase();
        assert(category === arr[1]);
        assert(lowerName === arr[3] ||
               lowerName.replace('?', '_q') === arr[3] ||
               lowerName.replace('.', '_dot') === arr[3] ||
               lowerName.replace('/', '_') === arr[3]);

        // Prepare the data and write to Redis.
        // These are declared locally (they used to leak as implicit globals),
        // so every iteration's request callback closes over its own values.
        var ns = arr[2];
        var url = siteUrl + escape(href);

        request(url, function(err, resp, body) {
          if (err) {
            // One unreachable page should not poison the shared DOM with an
            // undefined body or store empty data; report it and move on.
            console.error(name + ': request failed for ' + url + ' (' + err.message + ')');
            return;
          }

          scrapper(body, function($) {
            var usageLines = [];
            $('.usage ul li').each(function() {
              usageLines.push($(this).text());
            });
            var usage = usageLines.join('\n');

            // If several '.doc .content' nodes match, the last one wins.
            var doc = '';
            $('.doc .content').each(function() {
              doc = $(this).text().replace(/\s{2,}/g, ' ');
            });

            // Redis data: 'func:*' are hashes of functions' name/ns/url/usage/doc
            // 'indices' is a zset for sorted function names
            var multi = redis.multi();
            // NOTE(review): this compares against the literal string
            // 'undefined'; kept as in the original, presumably a sanity check
            // for pages whose doc text is literally "undefined".
            if (doc === 'undefined') { console.log(name + ': Ahhh!'); }
            multi.hmset('func:' + name, 'name', name, 'ns', ns, 'url', url, 'usage', usage, 'doc', doc);
            multi.zadd('indices', index, name);
            multi.exec(function(execErr) {
              // Surface write failures instead of dropping them silently.
              if (execErr) {
                console.error(name + ': failed to write to Redis (' + execErr + ')');
              }
            });
          });
        });
      });
    });
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment