Skip to content

Instantly share code, notes, and snippets.

@rxaviers
Last active August 29, 2015 14:06
Show Gist options
  • Save rxaviers/87e089c35d46fd3a1492 to your computer and use it in GitHub Desktop.
Save rxaviers/87e089c35d46fd3a1492 to your computer and use it in GitHub Desktop.
cldr-data: npm (custom fetch) approach (See https://github.com/rxaviers/cldrjs/issues/18#issuecomment-56438757)
/**
* A simplified HTTP request client that accepts glob patterns v@VERSION
*
* Copyright 2013 Rafael Xavier de Souza
* Released under the MIT license
* https://github.com/rxaviers/request-glob/blob/master/LICENSE-MIT
*
* Date: @DATE
*/
var minimatch = require("minimatch");
var path = require("path");
var request = require("request");
var url = require("url")
/**
 * Print diagnostic output for the crawler.
 *
 * Forwards every argument unchanged to `console.log`. The call is made with
 * `console` as the receiver (not the `log` function itself) so it keeps
 * working with console implementations whose methods rely on `this`.
 */
function debug() {
  console.log.apply(console, arguments);
}
/**
 * Crawl HTML index pages starting at `initialUrl` and report every non-HTML
 * resource whose full URL matches the glob `globbedUrl`.
 *
 * URLs are fetched one at a time from a FIFO queue. A response whose
 * content-type starts with "text/html" is scanned for `a href="..."` links;
 * each link that could still lead to a glob match is queued. Any other
 * response is reported through `callback` when its URL matches the glob, and
 * silently skipped otherwise.
 *
 * @param {string} initialUrl - URL where crawling starts.
 * @param {string} globbedUrl - Glob (minimatch syntax) tested against full URLs.
 * @param {Object} [options] - Forwarded to `request`; may be omitted, in which
 *   case the third argument is taken as `callback`.
 * @param {Function} callback - Invoked as `callback(error)` when a request
 *   fails or when the queue drains without any match ("Nothing found"), and as
 *   `callback(error, request, response, body, remaining)` for each match, where
 *   `remaining` is the number of URLs still queued at that moment.
 */
function globRequest(initialUrl, globbedUrl, options, callback) {
  var found = 0;
  var urls = [initialUrl];
  // URLs that have already been queued; prevents re-fetching (and looping on)
  // links that appear more than once or that point back to a visited page.
  var seen = Object.create(null);

  if (!callback) {
    callback = options;
    options = undefined;
  }
  seen[initialUrl] = true;

  // Queue every link found in `body` (an HTML page served from `baseUrl`)
  // that may still lead to a glob match and has not been queued before.
  function crawl(baseUrl, body) {
    var linkPattern = /a href="([^"]*)"/g;
    var match, candidate;

    while ((match = linkPattern.exec(body)) !== null) {
      // Skip "../".
      if (match[1] === "../") {
        continue;
      }
      candidate = urlJoin(baseUrl, match[1]);
      // Skip anything already queued or that can't lead to a matching glob.
      if (seen[candidate] || !mayMatchGlob(candidate)) {
        continue;
      }
      seen[candidate] = true;
      debug("+ ", candidate);
      urls.push(candidate);
    }
  }

  // Fetch `url`, then crawl it, report it, or skip it, and move on.
  function get(url) {
    debug("GET", url);
    var thisRequest = _request(url, options, function(error, response, body) {
      if (error) {
        debug("Whops, error!");
        return callback(error);
      }
      if ((/^text\/html/).test(response.headers["content-type"])) {
        debug("Crawl");
        crawl(url, body);
        return next();
      }
      if (!matchGlob(url)) {
        debug("Content skipped");
        return next();
      }
      debug("Got content");
      found++;
      callback(error, thisRequest, response, body, urls.length);
      // The body has already been fully buffered by the time this callback
      // runs, so continue right away: an "end" listener attached here would be
      // registered too late to ever fire, stalling the crawl after the first
      // match.
      next();
    });
  }

  // True when `url` fully matches the requested glob.
  function matchGlob(url) {
    return minimatch(url, globbedUrl);
  }

  // True when `url` is, or may be an ancestor of, something matching the glob.
  function mayMatchGlob(url) {
    // If globbedUrl contains "**", any url up to the initial ** may match glob.
    // For example: both /a/b/c and /a/b/c/d may match /a/b/**/x, because both
    // match /a/b/**.
    var doubleStar = globbedUrl.indexOf("**");
    var prefixGlob = doubleStar === -1 ?
      globbedUrl :
      globbedUrl.slice(0, doubleStar) + "**";

    // Compare url and glob segment by segment, truncated to the length of the
    // shorter of the two.
    var urlParts = url.split("/");
    var globParts = prefixGlob.split("/");
    var depth = Math.min(urlParts.length, globParts.length);

    return minimatch(
      urlParts.slice(0, depth).join("/"),
      globParts.slice(0, depth).join("/")
    );
  }

  // Fetch the next queued URL; report failure if the queue drained with no match.
  function next() {
    var url = urls.shift();
    if (url) {
      get(url);
    } else if (!found) {
      return callback(new Error("Nothing found"));
    }
  }

  // Call `request` with or without `options`, depending on whether any were given.
  function _request(url, options, callback) {
    if (options) {
      return request(url, options, callback);
    }
    return request(url, callback);
  }

  // Append the relative path `b` to the pathname of URL `a`, dropping any
  // trailing slash. POSIX joining is used explicitly because URL paths always
  // use "/", whatever the host platform's separator is.
  function urlJoin(a, b) {
    var parsed = url.parse(a);
    parsed.pathname = path.posix.join(parsed.pathname, b).replace(/\/$/, "");
    return url.format(parsed);
  }

  next();
}
// Expose globRequest as this module's export.
module.exports = globRequest;
// Proof-of-concept driver: crawl the CLDR JSON repository listing and log the
// size of every file matching the glob, plus how many URLs are still queued.
// Declared with `var` so no implicit global is created.
var requestGlob = require("./request-glob");

requestGlob(
  // Initial url to start crawling
  "http://www.unicode.org/repos/cldr-aux/json/26",
  // Glob pattern to find. Examples:
  // - "http://www.unicode.org/repos/cldr-aux/json/26/main/en/numbers.json"
  // - "http://www.unicode.org/repos/cldr-aux/json/26/main/*/numbers.json"
  // - "http://www.unicode.org/repos/cldr-aux/json/26/**/numbers.json"
  "http://www.unicode.org/repos/cldr-aux/json/26/main/*/numbers.json",
  // Callback function: called once per matching file, or once with an error.
  function(error, request, response, body, remaining) {
    if (error) {
      return console.log("Whops", error.message);
    }
    console.log("=-=-=-= (", body.length, ") +", remaining);
  }
);
@rxaviers
Copy link
Author

To run this POC, execute:

node test.sh

@rxaviers
Copy link
Author

Add comments here rxaviers/cldrjs#18 (comment)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment