Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@jdlrobson
Created February 23, 2016 01:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jdlrobson/c12e41e348ae8729c7bd to your computer and use it in GitHub Desktop.
Save jdlrobson/c12e41e348ae8729c7bd to your computer and use it in GitHub Desktop.
// HTTP client used for every MediaWiki API request made by this script.
var fetch = require('node-fetch');
var io = require( 'socket.io-client' );
// Sample state keyed by article title: '' while the title still has to be processed,
// 'N/A' when the page has no page image, true when its page image is free and
// false when it is non-free.
var titles = {};
// Maps a page image name ('File:…') to the article title it was reported for.
var pageImages = {};
// Maps an article title to the boolean recorded by queryHasFreeImages.
var alternativePageImageAvailable = {};
// NOTE(review): not referenced elsewhere in the visible code; the timer handle that is
// actually used is a local `interval` declared inside the `results.then` callback.
var interval;
// Socket connected to stream.wikimedia.org/rc as soon as the script is loaded.
var socket = io.connect('stream.wikimedia.org/rc');
// Used to turn a parameter object into the query string appended to the API URL.
var param = require('node-jquery-param');
// Command line arguments following `node <script>`.
var args = process.argv.slice(2);
var fs = require('fs');
/**
 * Collect a sample of distinct article titles from the live edit stream.
 *
 * Every main-namespace (namespace 0) change whose title has not been seen yet
 * is recorded in the shared `titles` map with the placeholder value ''.
 *
 * @param {number} sampleSize Number of distinct titles to collect.
 * @return {Promise} Resolved without a value once `sampleSize` titles are recorded.
 */
function sourceFromLiveEdits( sampleSize ) {
	var collected = 0;
	return new Promise( function ( resolve ) {
		// Keep a handle on the connection opened here: the subscription has to be
		// sent on the same socket whose 'change' events are listened to, and the
		// connection has to be closed again once the sample is complete.
		var stream = io.connect( 'stream.wikimedia.org/rc' );

		stream.on( 'connect', function () {
			stream.emit( 'subscribe', 'en.wikipedia.org' );
		} );

		stream.on( 'change', function ( data ) {
			if ( data.namespace !== 0 || collected >= sampleSize ) {
				return;
			}
			if ( titles[data.title] === undefined ) {
				titles[data.title] = '';
				collected += 1;
				console.log( 'Collected', data.title, collected, 'results so far' );
			}
			if ( collected === sampleSize ) {
				// Stop receiving events so the open connection does not keep
				// the process alive after the sample has been gathered.
				stream.disconnect();
				resolve();
			}
		} );
	} );
}
/**
 * Look up what has been recorded about alternative images for a page.
 *
 * @param {string} title Article title.
 * @return {boolean|undefined} The value stored in `alternativePageImageAvailable`
 *  for the title, or undefined when nothing has been stored for it.
 */
function hasAlternativePageImage( title ) {
	var recorded = alternativePageImageAvailable;
	return recorded[ title ];
}
/**
 * Print summary statistics for the sample held in `titles`.
 *
 * Logs the raw counters (non-free, total, no image, non-free with alternative)
 * followed by three whole-number percentages derived from them.
 */
function report() {
	var notfree = 0, alt = 0, total = 0, noimage = 0;

	// Whole-number percentage, rounded down. Yields 0 when the denominator is 0
	// so that an empty sample (or one without images) never prints "NaN%".
	function percentage( part, whole ) {
		return whole > 0 ? Math.floor( ( part / whole ) * 100 ) : 0;
	}

	Object.keys( titles ).forEach( function ( title ) {
		var status = titles[title];
		total += 1;
		if ( status === false ) {
			// false marks a non-free page image.
			notfree += 1;
			if ( hasAlternativePageImage( title ) ) {
				alt += 1;
			}
		} else if ( status === 'N/A' ) {
			// 'N/A' marks a page without any page image.
			noimage += 1;
		}
	} );

	console.log(notfree, total, noimage, alt );
	console.log( percentage( noimage, total ) + '% of titles have no page image.' );
	console.log( percentage( notfree, total - noimage ) + '% of images in sample that have images are non-free images.' );
	console.log( percentage( alt, notfree ) + '% of images in sample that have non-free images have a free alternative.' );
}
// English Wikipedia action API endpoint; every query built in this file is sent here.
var api = 'https://en.wikipedia.org/w/api.php';
// Wikimedia Commons action API endpoint.
// NOTE(review): not referenced anywhere in the visible code.
var commonsApi = 'https://commons.wikimedia.org/w/api.php';
/**
 * Check whether a page embeds at least one image that could replace a
 * non-free page image, and record the answer in `alternativePageImageAvailable`.
 *
 * An image qualifies when it is wider than 270 pixels and its extended
 * metadata carries no `NonFree` field.
 *
 * @param {string} title Article title to inspect.
 * @return {Promise} Settles once the answer has been recorded. Failures are
 *  logged rather than propagated, leaving the entry for `title` untouched.
 */
function queryHasFreeImages( title ) {
	var minWidth = 270;
	console.log( 'Checking status of all images on ', title, ' page... ');
	var url = api + '?' + param( {
		action: 'query',
		prop: 'imageinfo',
		generator: 'images',
		iiprop: 'extmetadata|dimensions',
		iiextmetadatafilter: 'NonFree',
		format: 'json',
		formatversion: 2,
		titles: title
	} );
	return fetch( url ).then( function ( resp ) {
		return resp.json();
	} ).then( function ( json ) {
		// A page without images produces a response with no `query` member.
		var pages = ( json && json.query && json.query.pages ) || [];
		var suitableAlternative = pages.some( function ( page ) {
			var info = page.imageinfo && page.imageinfo[0];
			var metadata = info && info.extmetadata;
			// extmetadata is keyed by field name, so the presence of the key is
			// what matters. Checking `.length` would treat every populated
			// metadata object as empty.
			var nonFree = Boolean( metadata ) &&
				Object.prototype.hasOwnProperty.call( metadata, 'NonFree' );
			return Boolean( info ) && info.width > minWidth && !nonFree;
		} );
		alternativePageImageAvailable[title] = suitableAlternative;
		console.log( 'DATA: ', title, ' has alternative? ', suitableAlternative );
	} ).catch( function ( err ) {
		console.log( 'ERROR', 'could not check images on', title, err );
	} );
}
/**
 * Work out whether each page image is free and record the result in `titles`.
 *
 * A file counts as non-free when it is in "Category:All non-free media".
 * Files that do not exist locally, or that are not in that category, are
 * assumed to be free. For every non-free page image a follow-up check for a
 * free alternative is started via `queryHasFreeImages`.
 *
 * @param {string[]} imgs File titles ('File:…') previously registered in `pageImages`.
 * @return {Promise} Settles once all statuses from the response are recorded.
 *  Failures are logged rather than propagated, leaving the affected titles unprocessed.
 */
function queryNonFreeStatus( imgs ) {
	var nonFreeCategory = 'Category:All non-free media';
	console.log( 'Querying non-free status... ');
	var url = api + '?' + param( {
		action: 'query',
		prop: 'categories',
		// Ask only about the category of interest. Without this filter the
		// category limit (10 by default) is shared by all queried files, so
		// the non-free category can be cut from the response and the file
		// wrongly assumed to be free.
		clcategories: nonFreeCategory,
		cllimit: 'max',
		format: 'json',
		formatversion: 2,
		titles: imgs.join( '|' )
	} );
	console.log( url );
	return fetch( url ).then( function ( resp ) {
		console.log( 'got response [categories]' );
		return resp.json();
	} ).then( function ( files ) {
		var query = ( files && files.query ) || {};
		// Maps the title the API answered with back to the title that was asked for.
		var denormalizedTitles = {};
		console.log( 'got json [categories]' );
		( query.normalized || [] ).forEach( function ( data ) {
			denormalizedTitles[data.to] = data.from;
		} );
		console.log( denormalizedTitles );
		console.log( '$$$$$$$$ ' );
		( query.pages || [] ).forEach( function ( pageImage ) {
			var pageImageTitle = denormalizedTitles[pageImage.title] || pageImage.title;
			var title = pageImages[pageImageTitle];
			var free;
			if ( !title ) {
				console.log( 'ERROR', pageImage.title, pageImageTitle, JSON.stringify( pageImages ) );
				return;
			}
			console.log( 'Inspecting ' + pageImage.title );
			// A file missing locally is assumed to be on commons and therefore
			// free; otherwise it is free unless it is in the non-free category.
			free = Boolean( pageImage.missing ) ||
				!( pageImage.categories || [] ).some( function ( category ) {
					return category.title === nonFreeCategory;
				} );
			console.log( 'DATA:' + pageImage.title + ' used on ' + title + ' is ' + ( free ? 'free' : 'not free' ) );
			titles[title] = free;
			if ( !free ) {
				queryHasFreeImages( title );
			}
		} );
	} ).catch( function ( err ) {
		console.log( 'ERROR', 'could not determine non-free status', err );
	} );
}
/**
 * Handle a `prop=pageimages` API response.
 *
 * Pages without a page image are marked 'N/A' in `titles`. Pages with one
 * have it registered in `pageImages`, and the collected files are passed to
 * `queryNonFreeStatus` to find out whether they are free.
 *
 * @param {Object} json Parsed API response (formatversion 2).
 */
function processPageImages( json ) {
	var imgs = [];
	// Maps the title the API answered with back to the title that was asked for.
	var requestedTitles = {};
	var query = json && json.query;
	console.log( 'Got json [pageimages]' );
	if ( !query || !Array.isArray( query.pages ) ) {
		// E.g. an API error response. Leave the titles untouched so they stay pending.
		console.log( 'ERROR', 'unexpected pageimages response', JSON.stringify( json ) );
		return;
	}
	( query.normalized || [] ).forEach( function ( entry ) {
		requestedTitles[entry.to] = entry.from;
	} );
	query.pages.forEach( function ( page ) {
		// Record results under the key used in `titles`. Storing them under the
		// normalised title would leave the requested title pending forever.
		var title = requestedTitles[page.title] || page.title;
		var file;
		if ( page.pageimage ) {
			// prepare to work out if the page image is free.
			// NOTE(review): `pageImages` keeps a single title per file, so two
			// sampled pages sharing a page image overwrite each other here.
			file = 'File:' + page.pageimage;
			imgs.push( file );
			pageImages[file] = title;
		} else {
			console.log( 'DATA:' + title + ' has no page image' );
			// Nothing to classify: mark the page as having no page image.
			titles[title] = 'N/A';
		}
	} );
	if ( imgs.length ) {
		console.log( 'querying for ' + imgs.length + ' page images non-free/free status' );
		queryNonFreeStatus( imgs );
	}
}
/**
 * Request the page image names for a batch of titles and hand the response
 * to `processPageImages`.
 *
 * @param {string[]} query Article titles to look up in a single API request.
 * @return {Promise} Settles once the response has been processed. Failures are
 *  logged rather than propagated, so the affected titles simply stay pending.
 */
function fetchPageImages( query ) {
	console.log( 'Querying ', query.length, ' entries for sample' );
	var url = api + '?' + param( {
		action: 'query',
		prop: 'pageimages',
		formatversion: 2,
		format: 'json',
		piprop: 'name',
		pilimit: query.length,
		titles: query.join( '|' )
	} );
	return fetch( url ).then( function ( resp ) {
		return resp.json();
	} ).then( function ( json ) {
		processPageImages( json );
	} ).catch( function ( err ) {
		// A failed request must not take the whole run down as an unhandled rejection.
		console.log( 'ERROR', 'page image lookup failed', err );
	} );
}
// `results` receives the promise returned by the selected title source;
// `source` is the first command line argument and selects that source
// ('rcstream' or 'file').
var results,
source = args[0];
/**
 * Load article titles from a text file located next to this script.
 *
 * The file holds one title per line. Underscores are turned into spaces and
 * every non-empty title is recorded in `titles` with the placeholder value ''.
 *
 * @param {string} path File name relative to the directory of this script.
 * @return {Promise} Resolved without a value once all titles are recorded;
 *  rejected with the read error when the file cannot be read.
 */
function loadFile( path ) {
	return new Promise( function ( resolve, reject ) {
		fs.readFile( __dirname + '/' + path, function ( err, data ) {
			if ( err ) {
				// Surface the real cause instead of failing on `data` being undefined.
				reject( err );
				return;
			}
			console.log( 'Loaded file' );
			// Accept both Unix and Windows line endings.
			var fileTitles = data.toString().split( /\r?\n/ );
			console.log( 'Populating data structure' );
			fileTitles.forEach( function ( line ) {
				var title = line.replace( /_/g, ' ' ).trim();
				if ( title ) {
					titles[title] = '';
				}
			} );
			resolve();
		} );
	} );
}
// Select where the sample of titles comes from: `rcstream [sampleSize]` samples
// live edits (10 titles unless a size is given), `file` reads titles.txt.
if ( source === 'rcstream' ) {
	results = sourceFromLiveEdits( parseInt( args[1], 10 ) || 10 );
} else if ( source === 'file' ) {
	results = loadFile( 'titles.txt' );
} else {
	throw Error( 'Please provide a source e.g. `node index.js rcstream`' );
}

// Once the sample is available, work through it in small batches on a timer
// and print the report when no unprocessed title is left.
results.then( function () {
	// Maximum number of titles sent to the API per tick.
	var batchSize = 10;
	// Milliseconds between two ticks.
	var pollDelay = 5000;
	var interval;
	console.log( 'Got required sample' );

	function processCollectedData() {
		// A title is pending while its status is neither a boolean
		// (free/non-free) nor 'N/A' (no page image).
		var query = Object.keys( titles ).filter( function ( title ) {
			return typeof titles[title] !== 'boolean' && titles[title] !== 'N/A';
		} ).slice( 0, batchSize );

		if ( query.length ) {
			fetchPageImages( query );
		} else {
			// No new ones needed to be processed so do report
			clearInterval( interval );
			report();
			// The stream connection opened at load time would otherwise keep
			// the process running after the report has been printed.
			socket.disconnect();
		}
	}

	interval = setInterval( processCollectedData, pollDelay );
} ).catch( function ( err ) {
	// E.g. the titles file could not be read.
	console.error( 'Could not obtain a sample:', err );
	process.exitCode = 1;
} );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment