Skip to content

Instantly share code, notes, and snippets.

@tomayac
Created June 20, 2015 17:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tomayac/242a3f48ca84c6e21ce3 to your computer and use it in GitHub Desktop.
Save tomayac/242a3f48ca84c6e21ce3 to your computer and use it in GitHub Desktop.
'use strict';
var async = require('async');
var request = require('request');
var ExpontentialSmoothingStream = require('exponential-smoothing-stream');
var numbers = require('numbers');
var geolib = require('geolib');
var util = require ('./util.js');
// Backlinks query listing the redirects that point at an article. The
// {{LANGUAGE}} placeholder is replaced with the wiki language code and the
// article title is appended by the caller.
var REDIRECTS_URL = [
  'http://{{LANGUAGE}}.wikipedia.org/w/api.php?action=query',
  '&blnamespace=0&list=backlinks&blfilterredir=redirects&bllimit=max&',
  'format=json&bltitle='
].join('');

// Language links query. Callers prefix 'http://' plus the language code and
// append the article title.
var LANGUAGE_LINKS_URL = [
  '.wikipedia.org/w/api.php?action=query&',
  'prop=langlinks&format=json&lllimit=max&titles='
].join('');

// Primary geo coordinates query. Prefixed and suffixed like
// LANGUAGE_LINKS_URL.
var GEO_COORDINATES_URL = [
  '.wikipedia.org/w/api.php?action=query&',
  'prop=coordinates&format=json&colimit=max&coprop&coprimary=primary&titles='
].join('');

// Revisions query. The {{rvstart}} placeholder is replaced with the start
// timestamp; prefixed and suffixed like LANGUAGE_LINKS_URL.
var REVISIONS_URL = [
  '.wikipedia.org/w/api.php?action=query&format=json',
  '&rvstart={{rvstart}}&prop=revisions&rvprop=timestamp|user&rvlimit=max',
  '&rvdir=newer&titles='
].join('');

// Identifies this client in the User-Agent header of every API request.
var USER_AGENT =
    'Disaster Monitor * Contact: Thomas Steiner (tomac@google.com)';

// HTTP headers sent with every API request.
var HEADERS = { 'User-Agent': USER_AGENT };

// Maximum number of per-language requests run concurrently.
var PARALLEL_LIMIT = 5;
/**
 * Builds the options object shared by every Wikipedia API request.
 *
 * @param {string} url Fully assembled request URL.
 * @return {Object} Options for `request.get`.
 */
function buildRequestOptions(url) {
  return {
    url: url,
    headers: HEADERS,
    timeout: 5000
  };
}

/**
 * Parses a response body as JSON without throwing.
 *
 * @param {string} body Raw response body.
 * @return {*} The parsed value, or null when the body is not valid JSON.
 */
function parseJson(body) {
  try {
    return JSON.parse(body);
  } catch (e) {
    // A throw here would escape the asynchronous request callback and take
    // the whole process down, so signal the failure with null instead.
    return null;
  }
}

/**
 * Returns the first entry of `data.query.pages`.
 *
 * @param {*} data Parsed API response.
 * @return {?Object} The first page object, or null when the response has no
 *     `query.pages` or the pages object is empty.
 */
function firstPage(data) {
  if (!data || !data.query || !data.query.pages) {
    return null;
  }
  var pages = data.query.pages;
  var pageIds = Object.keys(pages);
  return pageIds.length ? (pages[pageIds[0]] || null) : null;
}

/**
 * Decodes a percent-encoded title, falling back to the raw title when it is
 * not valid percent-encoding (e.g. a title containing a literal '%').
 *
 * @param {string} title Article title.
 * @return {string} Decoded title, or the input unchanged.
 */
function safeDecode(title) {
  try {
    return decodeURIComponent(title);
  } catch (e) {
    return title;
  }
}

/**
 * Requests `url` and passes the first page of the response to `callback`.
 *
 * @param {string} url Request URL.
 * @param {function(*, ?Object=)} callback Called with the transport error or
 *     'Error <status>' on failure; otherwise with null and the first page
 *     (null when the body is not JSON or contains no pages).
 */
function fetchFirstPage(url, callback) {
  request.get(buildRequestOptions(url), function(err, response, body) {
    if (err || response.statusCode !== 200) {
      return callback(err || 'Error ' + response.statusCode);
    }
    return callback(null, firstPage(parseJson(body)));
  });
}

/**
 * Fetches the language links of an article and appends the article itself,
 * so the result lists every language edition to be queried.
 *
 * @param {string} language Wiki language code of the article.
 * @param {string} article Article title (not URL-encoded).
 * @param {function(*, Array=)} callback Called with the transport error,
 *     'Internal Server Error' or 'File Not Found' on failure; otherwise with
 *     null and an array of `{lang, '*'}` entries.
 */
function fetchLanguageLinks(language, article, callback) {
  var url = 'http://' + language + LANGUAGE_LINKS_URL +
      encodeURIComponent(article);
  request.get(buildRequestOptions(url), function(err, response, body) {
    if (err || response.statusCode !== 200) {
      return callback(err || 'Internal Server Error');
    }
    var page = firstPage(parseJson(body));
    if (!page) {
      return callback('Internal Server Error');
    }
    if (!page.langlinks) {
      return callback('File Not Found');
    }
    return callback(null, page.langlinks.concat([{
      lang: language,
      '*': article
    }]));
  });
}

var wikipedia = {
  /**
   * Collects the revisions made since 24 hours ago to an article and to all
   * of its language-linked editions, and derives edit intervals from them.
   *
   * @param {string} language Wiki language code of the article.
   * @param {string} article Article title (not URL-encoded).
   * @param {function(*, Object=)} callback Called with an error (transport
   *     error, 'Internal Server Error' or 'File Not Found'), or with null and
   *     `{revisions, intervals, spiking}`: `revisions` is sorted newest
   *     first, `intervals` holds the values emitted by the smoothing stream
   *     for the gaps between consecutive revisions, and `spiking` is a
   *     boolean.
   */
  getRevisions: function(language, article, callback) {
    console.log('Getting revisions of ' + language + ':' + article + '.');
    fetchLanguageLinks(language, article, function(err, langLinks) {
      if (err) {
        return callback(err);
      }
      var yesterday =
          new Date(Date.now() - (24 * 60 * 60 * 1000)).toISOString();
      var tasks = {};
      langLinks.forEach(function(langLink) {
        var title = langLink.lang + ':' + safeDecode(langLink['*']);
        tasks[title] = function(innerCallback) {
          var url = 'http://' + langLink.lang +
              REVISIONS_URL.replace(/\{\{rvstart\}\}/, yesterday) +
              encodeURIComponent(langLink['*']);
          fetchFirstPage(url, function(innerErr, page) {
            if (innerErr) {
              return innerCallback(innerErr);
            }
            if (!page || !page.revisions) {
              return innerCallback(null, []);
            }
            return innerCallback(null, page.revisions.map(function(revision) {
              return {
                user: revision.user,
                timestamp: new Date(revision.timestamp).getTime(),
                date: revision.timestamp,
                article: title
              };
            }));
          });
        };
      });
      async.parallelLimit(tasks, PARALLEL_LIMIT, function(err, results) {
        if (err) {
          return callback('Internal Server Error');
        }
        var revisions = [];
        Object.keys(results).forEach(function(title) {
          revisions = revisions.concat(results[title]);
        });
        // Newest revision first.
        revisions.sort(function(a, b) {
          return b.timestamp - a.timestamp;
        });
        var intervals = [];
        var ess = new ExpontentialSmoothingStream({ smoothingFactor: 0.5 });
        ess.on('data', function(data) {
          intervals.push(data);
        });
        ess.on('end', function() {
          var standardDeviation = numbers.statistic.standardDev(intervals);
          // NOTE(review): revisions are fed newest first, so the last
          // interval stems from the oldest edits -- confirm that this is the
          // value the spike check is meant to inspect.
          var spiking = (intervals.length >= 5) &&
              (intervals[intervals.length - 1] < standardDeviation / 2);
          return callback(null, {
            revisions: revisions,
            intervals: intervals,
            spiking: spiking
          });
        });
        revisions.forEach(function(revision, i) {
          if (i > 0) {
            // Timestamps are already numeric milliseconds.
            ess.write(revisions[i - 1].timestamp - revision.timestamp);
          }
        });
        ess.end();
      });
    });
  },

  /**
   * Collects the primary geo coordinates of an article and of all of its
   * language-linked editions, de-duplicates them and computes their center.
   *
   * @param {string} language Wiki language code of the article.
   * @param {string} article Article title (not URL-encoded).
   * @param {function(*, Object=)} callback Called with 'Internal Server
   *     Error' or 'File Not Found' on failure; otherwise with null and
   *     `{individualCoordinates, averageCoordinates}`. Each coordinate is
   *     `{lat, lon, map}`; `averageCoordinates` is an empty object when no
   *     coordinates were found.
   */
  getGeolocation: function(language, article, callback) {
    console.log('Geo-referencing ' + language + ':' + article + '.');
    fetchLanguageLinks(language, article, function(err, langLinks) {
      if (err) {
        // This method only ever reports the two status strings; transport
        // error objects are masked.
        return callback(
            err === 'File Not Found' ? err : 'Internal Server Error');
      }
      var tasks = {};
      langLinks.forEach(function(langLink) {
        var title = langLink.lang + ':' + safeDecode(langLink['*']);
        tasks[title] = function(innerCallback) {
          // The title must be encoded, otherwise characters such as '&' or
          // '#' would corrupt the query string.
          var url = 'http://' + langLink.lang + GEO_COORDINATES_URL +
              encodeURIComponent(langLink['*']);
          fetchFirstPage(url, function(innerErr, page) {
            if (innerErr) {
              return innerCallback(innerErr);
            }
            if (!page || !page.coordinates) {
              return innerCallback(null, []);
            }
            return innerCallback(null, page.coordinates.map(function(geo) {
              return {
                lat: geo.lat,
                lon: geo.lon
              };
            }));
          });
        };
      });
      async.parallelLimit(tasks, PARALLEL_LIMIT, function(err, results) {
        if (err) {
          return callback('Internal Server Error');
        }
        // O(1) coordinates deduplication across all language editions.
        var seen = {};
        Object.keys(results).forEach(function(title) {
          results[title].forEach(function(geo) {
            seen[geo.lat + '|' + geo.lon] = true;
          });
        });
        var individualCoordinates = Object.keys(seen).map(function(key) {
          var parts = key.split('|');
          return {
            lat: parseFloat(parts[0]),
            lon: parseFloat(parts[1]),
            map: util.createGoogleMapsUrl(parts[0], parts[1])
          };
        });
        var averageCoordinates = {};
        if (individualCoordinates.length) {
          var center = geolib.getCenter(
              individualCoordinates.map(function(coordinate) {
                return {
                  latitude: coordinate.lat,
                  longitude: coordinate.lon
                };
              }));
          averageCoordinates.lat = center.latitude;
          averageCoordinates.lon = center.longitude;
          averageCoordinates.map = util.createGoogleMapsUrl(
              averageCoordinates.lat, averageCoordinates.lon);
        }
        return callback(null, {
          individualCoordinates: individualCoordinates,
          averageCoordinates: averageCoordinates
        });
      });
    });
  },

  /**
   * Lists the titles that redirect to an article.
   *
   * @param {string} language Wiki language code of the article.
   * @param {string} article Article title (not URL-encoded).
   * @param {function(*, Array<string>=)} callback Called with 'Internal
   *     Server Error' on failure; otherwise with null and an array holding
   *     `article` followed by the titles of its redirects.
   */
  getRedirects: function(language, article, callback) {
    console.log('Getting redirects for ' + language + ':' + article + '.');
    // Whitespace becomes '_' and the result is encoded so that characters
    // such as '&' cannot break the query string.
    var url = REDIRECTS_URL.replace(/\{\{LANGUAGE\}\}/, language) +
        encodeURIComponent(article.replace(/\s/g, '_'));
    request.get(buildRequestOptions(url), function(err, response, body) {
      if (err || response.statusCode !== 200) {
        return callback('Internal Server Error');
      }
      var data = parseJson(body);
      if (!data) {
        return callback('Internal Server Error');
      }
      var results = [article];
      if (data.query && Array.isArray(data.query.backlinks)) {
        data.query.backlinks.forEach(function(backlink) {
          results.push(backlink.title);
        });
      }
      return callback(null, results);
    });
  }
};
module.exports = wikipedia;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment