Created
February 23, 2016 01:05
-
-
Save jdlrobson/c12e41e348ae8729c7bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Dependencies and shared state used by every function in this script.
var fetch = require('node-fetch');
var io = require( 'socket.io-client' );
// Article title -> status. '' means "not processed yet", 'N/A' means the
// article has no page image, true/false records whether its page image is free.
var titles = {};
// 'File:...' page image name -> title of the article that uses it.
var pageImages = {};
// Article title -> whether another, suitable image was found on the article.
var alternativePageImageAvailable = {};
// Declared but never assigned at module level.
var interval;
// NOTE: this connection is opened as soon as the module loads, whichever
// source is selected on the command line.
var socket = io.connect('stream.wikimedia.org/rc');
var param = require('node-jquery-param');
// Command line arguments that follow `node <script>`.
var args = process.argv.slice(2);
var fs = require('fs');
/**
 * Collect a sample of distinct main-namespace (namespace 0) page titles from
 * the recent changes stream, subscribed to en.wikipedia.org.
 *
 * Each newly seen title is added to the shared `titles` map with the
 * placeholder value '' so that it is picked up for processing later.
 *
 * @param {number} sampleSize Number of distinct titles to collect.
 * @return {Promise} Resolves with no value once the sample is complete.
 */
function sourceFromLiveEdits( sampleSize ) {
	var collected = 0;

	return new Promise( function ( resolve ) {
		var stream;

		// A sample size that is not a positive number can never be reached by
		// the counter below, so settle immediately instead of waiting forever.
		if ( !( sampleSize > 0 ) ) {
			resolve();
			return;
		}

		stream = io.connect( 'stream.wikimedia.org/rc' );

		stream.on( 'connect', function () {
			// Subscribe on the same connection the 'change' listener is
			// attached to, rather than on an unrelated socket.
			stream.emit( 'subscribe', 'en.wikipedia.org' );
		} );

		stream.on( 'change', function ( data ) {
			if ( data.namespace !== 0 || collected >= sampleSize ) {
				return;
			}
			// Own-property check so inherited names are not mistaken for
			// titles that were already collected.
			if ( Object.prototype.hasOwnProperty.call( titles, data.title ) ) {
				return;
			}
			titles[data.title] = '';
			collected += 1;
			console.log( 'Collected', data.title, collected, 'results so far' );

			if ( collected >= sampleSize ) {
				// The sample is complete; further edits are of no interest.
				stream.disconnect();
				resolve();
			}
		} );
	} );
}
/**
 * Tell whether an alternative page image has been recorded for an article.
 *
 * @param {string} title Article title.
 * @return {boolean} True only when `alternativePageImageAvailable` holds the
 *  value `true` for the title; false when it holds anything else or nothing.
 */
function hasAlternativePageImage( title ) {
	// Strict comparison so that a missing entry, or a property inherited from
	// Object.prototype, is never reported as an available alternative.
	return alternativePageImageAvailable[title] === true;
}
/**
 * Express `part` out of `whole` as a whole-number percentage, truncated.
 *
 * @param {number} part
 * @param {number} whole
 * @return {number} 0 when `whole` is not positive, so no NaN is printed.
 */
function toPercentage( part, whole ) {
	return whole > 0 ? Math.floor( ( part / whole ) * 100 ) : 0;
}

/**
 * Print a summary of the statuses recorded in `titles`.
 *
 * Counts every title, those marked `false` (non-free page image), those
 * marked 'N/A' (no page image), and the non-free ones for which
 * `hasAlternativePageImage` reports an alternative. Logs the raw counts
 * followed by three percentages.
 */
function report() {
	var notfree = 0,
		alt = 0,
		total = 0,
		noimage = 0;

	Object.keys( titles ).forEach( function ( title ) {
		var status = titles[title];

		total += 1;
		if ( status === false ) {
			notfree += 1;
			if ( hasAlternativePageImage( title ) ) {
				alt += 1;
			}
		} else if ( status === 'N/A' ) {
			noimage += 1;
		}
	} );

	console.log( notfree, total, noimage, alt );
	console.log( toPercentage( noimage, total ) + '% of titles have no page image.' );
	console.log( toPercentage( notfree, total - noimage ) + '% of images in sample that have images are non-free images.' );
	console.log( toPercentage( alt, notfree ) + '% of images in sample that have non-free images have a free alternative.' );
}
// Endpoint that every query in this file is sent to.
var api = 'https://en.wikipedia.org/w/api.php';
// Commons endpoint; not referenced by any function in this file.
var commonsApi = 'https://commons.wikimedia.org/w/api.php';
/**
 * Look through the images used on an article for one that could replace a
 * non-free page image, and record the outcome in
 * `alternativePageImageAvailable[title]`.
 *
 * An image qualifies when it is wider than 270 and its extended metadata
 * carries no `NonFree` entry (the only metadata field requested).
 *
 * @param {string} title Article title.
 * @return {Promise} Resolves with the recorded boolean, or with undefined
 *  (nothing recorded) when the request or its processing fails.
 */
function queryHasFreeImages( title ) {
	var url;

	console.log( 'Checking status of all images on ', title, ' page... ');
	url = api + '?' + param( {
		action: 'query',
		prop: 'imageinfo',
		generator: 'images',
		iiprop: 'extmetadata|dimensions',
		iiextmetadatafilter: 'NonFree',
		format: 'json',
		formatversion: 2,
		titles: title
	} );

	return fetch( url ).then( function ( resp ) {
		return resp.json();
	} ).then( function ( json ) {
		// Tolerate a response without any pages instead of crashing on it.
		var pages = ( json.query && json.query.pages ) || [],
			suitableAlternative;

		suitableAlternative = pages.some( function ( page ) {
			// Entries without image information cannot be judged.
			var info = page.imageinfo && page.imageinfo[0];

			if ( !info || !( info.width > 270 ) ) {
				return false;
			}
			// extmetadata is keyed by field name, so test for the NonFree
			// key itself; an object has no meaningful `length`.
			return !( info.extmetadata && info.extmetadata.NonFree );
		} );

		alternativePageImageAvailable[title] = suitableAlternative;
		console.log( 'DATA: ', title, ' has alternative? ', suitableAlternative );
		return suitableAlternative;
	} ).catch( function ( err ) {
		// Report the failure rather than leaving an unhandled rejection.
		console.log( 'ERROR', 'Unable to check the images on ' + title, err );
	} );
}
/**
 * Tell whether a file page returned by the API is listed in
 * "Category:All non-free media".
 *
 * @param {Object} pageImage One entry of `query.pages`.
 * @return {boolean}
 */
function isInNonFreeCategory( pageImage ) {
	return ( pageImage.categories || [] ).some( function ( category ) {
		return category.title === 'Category:All non-free media';
	} );
}

/**
 * Work out for each page image whether it is free, and store the answer
 * (true = free, false = non-free) in `titles` under the article that
 * `pageImages` associates with the image. For every article whose image is
 * non-free, `queryHasFreeImages` is then called.
 *
 * @param {string[]} imgs Page image names, each prefixed with 'File:'.
 * @return {Promise} Resolves with no value once the response has been
 *  processed; failures are logged, not propagated.
 */
function queryNonFreeStatus( imgs ) {
	var url;

	console.log( 'Querying non-free status... ');
	url = api + '?' + param( {
		action: 'query',
		prop: 'categories',
		// Ask only about the one category of interest and lift the default
		// limit, so a file with many categories cannot push the answer out
		// of the response.
		clcategories: 'Category:All non-free media',
		cllimit: 'max',
		format: 'json',
		formatversion: 2,
		titles: imgs.join( '|' )
	} );
	console.log( url );

	return fetch( url ).then( function ( resp ) {
		console.log( 'got response [categories]' );
		return resp.json();
	} ).then( function ( files ) {
		// Title as returned by the API -> title as it was requested.
		var denormalizedTitles = {};

		console.log( 'got json [categories]' );
		if ( !files.query || !files.query.pages ) {
			throw new Error( 'Unexpected categories response: ' + JSON.stringify( files ) );
		}
		( files.query.normalized || [] ).forEach( function ( data ) {
			denormalizedTitles[data.to] = data.from;
		} );
		console.log( denormalizedTitles );
		console.log( '$$$$$$$$ ' );

		files.query.pages.forEach( function ( pageImage ) {
			// pageImages is keyed by the name that was requested.
			var pageImageTitle = denormalizedTitles[pageImage.title] || pageImage.title,
				title = pageImages[pageImageTitle],
				free;

			if ( !title ) {
				console.log( 'ERROR', pageImage.title, pageImageTitle, JSON.stringify( pageImages ) );
				return;
			}
			console.log( 'Inspecting ' + pageImage.title );

			// A file page missing locally is assumed to live on Commons and
			// therefore to be free; so is a file outside the non-free category.
			free = Boolean( pageImage.missing ) || !isInNonFreeCategory( pageImage );
			console.log( 'DATA:' + pageImage.title + ' used on ' + title + ' is ' + ( free ? 'free' : 'not free' ) );
			titles[title] = free;

			if ( !free ) {
				queryHasFreeImages( title );
			}
		} );
	} ).catch( function ( err ) {
		// Report the failure rather than leaving an unhandled rejection.
		console.log( 'ERROR', 'Unable to query non-free status', err );
	} );
}
/**
 * Handle a `prop=pageimages` response: mark articles without a page image
 * as 'N/A' in `titles`, remember which article each page image belongs to in
 * `pageImages`, and hand the images on to `queryNonFreeStatus`.
 *
 * NOTE: `pageImages` holds a single article per image, so when two articles
 * share a page image the one processed later overwrites the earlier one.
 *
 * @param {Object} json Parsed API response.
 * @return {*} Whatever `queryNonFreeStatus` returns, or undefined when there
 *  is nothing to query.
 */
function processPageImages( json ) {
	var imgs = [],
		// Title as returned by the API -> title as it was requested.
		requestedTitles = Object.create( null ),
		pages = json.query && json.query.pages;

	console.log( 'Got json [pageimages]' );
	if ( !pages ) {
		console.log( 'ERROR', 'Unexpected pageimages response', JSON.stringify( json ) );
		return;
	}

	( json.query.normalized || [] ).forEach( function ( entry ) {
		requestedTitles[entry.to] = entry.from;
	} );

	pages.forEach( function ( page ) {
		// Record results under the key already present in `titles`;
		// otherwise a normalised title would leave the requested one
		// unprocessed forever.
		var title = requestedTitles[page.title] || page.title,
			file;

		if ( page.pageimage ) {
			// prepare to work out if the page image is free.
			file = 'File:' + page.pageimage;
			imgs.push( file );
			pageImages[file] = title;
		} else {
			console.log( 'DATA:' + page.title + ' has no page image' );
			// There is no image, so there is nothing to classify.
			titles[title] = 'N/A';
		}
	} );

	if ( imgs.length ) {
		console.log( 'querying for ' + imgs.length + ' page images non-free/free status' );
		return queryNonFreeStatus( imgs );
	}
}
/**
 * Request the page image name of every title in the batch and pass the
 * parsed response to `processPageImages`.
 *
 * @param {string[]} query Article titles to look up.
 * @return {Promise} Resolves once the response has been processed; failures
 *  are logged, not propagated.
 */
function fetchPageImages( query ) {
	var url;

	console.log( 'Querying ', query.length, ' entries for sample' );
	url = api + '?' + param( {
		action: 'query',
		prop: 'pageimages',
		formatversion: 2,
		format: 'json',
		piprop: 'name',
		pilimit: query.length,
		titles: query.join( '|' )
	} );

	return fetch( url ).then( function ( resp ) {
		return resp.json();
	} ).then( function ( json ) {
		return processPageImages( json );
	} ).catch( function ( err ) {
		// Report the failure rather than leaving an unhandled rejection.
		console.log( 'ERROR', 'Unable to fetch page images', err );
	} );
}
// Promise for the title sample; assigned below once the source is known.
var results,
	// First command line argument: where the sample comes from.
	source = args[0];
/**
 * Read a list of titles, one per line, from a file next to this script and
 * register each of them in `titles` with the placeholder value ''.
 * Underscores are turned into spaces and empty lines are skipped.
 *
 * @param {string} path File name relative to this script's directory.
 * @return {Promise} Resolves with no value once every title is registered;
 *  rejects with the read error when the file cannot be read.
 */
function loadFile( path ) {
	return new Promise( function ( resolve, reject ) {
		fs.readFile( __dirname + '/' + path, function ( err, data ) {
			if ( err ) {
				// Surface the failure to the caller; `data` is unusable here.
				reject( err );
				return;
			}
			console.log( 'Loaded file' );
			console.log( 'Populating data structure' );
			// Accept both LF and CRLF line endings so that no title keeps a
			// trailing carriage return.
			data.toString().split( /\r?\n/ ).forEach( function ( line ) {
				if ( line ) {
					titles[line.replace( /_/g, ' ' )] = '';
				}
			} );
			resolve();
		} );
	} );
}
// Sample size requested on the command line; only used with `rcstream`.
var requestedSampleSize = parseInt( args[1], 10 );

// Choose where the sample of titles comes from.
if ( source === 'rcstream' ) {
	// Fall back to 10 when the size is missing, not a number, or not
	// positive: a non-positive size could never be reached by the collector.
	results = sourceFromLiveEdits( requestedSampleSize > 0 ? requestedSampleSize : 10 );
} else if ( source === 'file' ) {
	results = loadFile( 'titles.txt' );
} else {
	throw new Error( 'Please provide a source e.g. `node index.js rcstream`' );
}
// Once the sample is available, work through it in batches and print the
// report when no title is left unprocessed.
results.then( function () {
	// Number of titles sent to the API per tick.
	var BATCH_SIZE = 10,
		interval;

	console.log( 'Got required sample' );

	/**
	 * Send the next batch of unprocessed titles for classification or, when
	 * none remain, stop the timer and print the report.
	 */
	function processCollectedData() {
		// A title is unprocessed until its status is a boolean or 'N/A'.
		var query = Object.keys( titles ).filter( function ( title ) {
			var status = titles[title];

			return typeof status !== 'boolean' && status !== 'N/A';
		} ).slice( 0, BATCH_SIZE );

		if ( query.length ) {
			fetchPageImages( query );
		} else {
			// No new ones needed to be processed so do report
			clearInterval( interval );
			report();
		}
	}

	interval = setInterval( processCollectedData, 5000 );
} ).catch( function ( err ) {
	// Surface a failed sample (for example an unreadable titles file)
	// instead of leaving an unhandled rejection.
	console.log( 'ERROR', err );
	process.exitCode = 1;
} );
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment