Skip to content

Instantly share code, notes, and snippets.

@bpedro
Last active January 3, 2016 22:49
Show Gist options
  • Save bpedro/8530629 to your computer and use it in GitHub Desktop.
Save bpedro/8530629 to your computer and use it in GitHub Desktop.
Crawl the Public API Directory, grab API documentation URLs, and compare each documentation page with a previously saved version. This script uses import·io. Please see http://support.import.io/knowledgebase/articles/258104-integrate-import-io-with-node-js
var importio = require('import-io').client,
fs = require('fs'),
sha1 = require('sha1'),
http = require('http-get');
// Credentials of the import.io account (placeholders to be filled in).
var userGuid = 'YOUR_USER_GUID',
    apiKey = 'YOUR_API_KEY';

// import.io client built from the credentials above and the
// 'query.import.io' endpoint name.
var io = new importio(userGuid, apiKey, 'query.import.io');

// GUIDs of the two crawlers: one for individual pages, one for the index.
var pageGuid = 'A_PAGE_CRAWLER_GUID',
    indexGuid = 'AN_INDEX_CRAWLER_GUID';

// Number of the index page being crawled; starts at the first page.
var idx = 1;
/**
 * Index callback factory.
 *
 * Returns a callback for one index-page query. The callback collects the
 * result rows of every data message and, once the query is finished,
 * triggers the page crawler for each collected row and then queries the
 * next index page.
 *
 * The crawl stops as soon as an index page yields no rows; otherwise the
 * script would keep requesting ever-higher page numbers forever.
 *
 * @returns {function(boolean, Object)} callback taking (finished, msg)
 */
var indexCb = function() {
  var data = [];
  return function(finished, msg) {
    // Only accept messages that really carry a `results` array. Messages
    // without one would make concat() append `undefined`, which then
    // crashes the row loop below.
    if (msg && msg.type === 'MESSAGE' && msg.data &&
        Array.isArray(msg.data.results)) {
      data = data.concat(msg.data.results);
    }
    if (!finished) {
      return;
    }
    if (data.length === 0) {
      // No rows on this index page: we ran past the last page (or the
      // page failed), so do not schedule another index query.
      return;
    }
    // Rows are walked from last to first, one page query per row.
    for (var i = data.length - 1; i >= 0; i--) {
      var row = data[i];
      if (!row || typeof row['url/_source'] !== 'string') {
        // A row without a source path cannot be turned into a page URL.
        continue;
      }
      io.query({
        'connectorGuids': [pageGuid],
        'input': {
          'webpage/url': 'http://publicapis.com' + row['url/_source']
        }
      }, pageCb());
    }
    // increment page number and re-crawl the index
    idx++;
    io.query({
      'connectorGuids': [indexGuid],
      'input': {
        'webpage/url': 'http://publicapis.com/apis/' + idx
      }
    }, indexCb());
  };
};
/**
 * Page callback factory.
 *
 * Returns a callback for one page query. The callback collects the result
 * rows of every data message and, once the query is finished, passes the
 * `url` of the first row to hashAndCompare().
 *
 * When the query produced no usable row, the page is reported on stderr and
 * skipped instead of throwing.
 *
 * @returns {function(boolean, Object)} callback taking (finished, msg)
 */
var pageCb = function() {
  var data = [];
  return function(finished, msg) {
    // Only accept messages that really carry a `results` array; anything
    // else would make concat() append `undefined`.
    if (msg && msg.type === 'MESSAGE' && msg.data &&
        Array.isArray(msg.data.results)) {
      data = data.concat(msg.data.results);
    }
    if (!finished) {
      return;
    }
    var first = data[0];
    if (!first || !first.url) {
      console.error('No documentation URL found; skipping page');
      return;
    }
    hashAndCompare(first.url);
  };
};
/**
 * Hash and compare URL contents with a previously saved version.
 *
 * Fetches `docUrl`, hashes the response body with sha1() and compares the
 * result with the hash stored in a file whose name is sha1(docUrl)
 * (relative to the current working directory).
 *
 *  - No stored hash yet: the current hash is saved as the baseline.
 *  - Stored hash differs: "<docUrl> changed!" is logged and the stored
 *    hash is replaced.
 *  - Stored hash is equal: nothing happens.
 *
 * Fetch and read failures are reported on stderr; they never throw out of
 * the asynchronous callback.
 *
 * @param {string} docUrl - URL of the documentation page to check.
 */
var hashAndCompare = function(docUrl) {
  http.get(docUrl, function(err, res) {
    if (err) {
      console.error('Unable to fetch ' + docUrl + ': ' + (err.message || err));
      return;
    }
    var hashFile = sha1(docUrl);
    var current = sha1(res.buffer);
    var previous = null;
    try {
      // Read as text so the comparison below is string against string.
      previous = fs.readFileSync(hashFile, 'utf8');
    } catch (readErr) {
      // A missing file only means this URL has not been seen before.
      if (readErr.code !== 'ENOENT') {
        console.error('Unable to read ' + hashFile + ': ' + readErr.message);
        return;
      }
    }
    if (previous === current) {
      return;
    }
    if (previous !== null) {
      console.log(docUrl + ' changed!');
    }
    fs.writeFileSync(hashFile, current);
  });
};
// Open the connection; the crawl is started from the connect callback.
io.connect(function(isConnected) {
  if (isConnected) {
    // Start with the first page of the publicapis index.
    var firstIndexQuery = {
      'connectorGuids': [indexGuid],
      'input': {
        'webpage/url': 'http://publicapis.com/apis/1'
      }
    };
    io.query(firstIndexQuery, indexCb());
  } else {
    // Without a connection there is nothing to query.
    console.error('Unable to connect');
  }
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment