jdlrobson/nonfree.js

## nonfree.js
// HTTP client used for the API requests made with fetch() below.
var fetch = require('node-fetch');
// socket.io client used to connect to the recent-changes stream.
var io = require( 'socket.io-client' );
// Sample of article titles, keyed by title. Values written in this file are
// '' (not classified yet), true / false (page image free / not free) and
// 'N/A' (the article has no page image).
var titles = {};
// Maps a page image file title ('File:…') to the article title it was found on.
var pageImages = {};
// Maps an article title to whether a suitable alternative image was found.
var alternativePageImageAvailable = {};
// NOTE(review): never assigned at module level in the visible code; the
// polling callback near the end of the file declares its own `interval`.
var interval;
// Connection to the recent-changes stream, opened as soon as the module loads.
var socket = io.connect('stream.wikimedia.org/rc');
// Used to turn a plain object of API parameters into a URL query string.
var param = require('node-jquery-param');
// Command line arguments that follow the node binary and the script path.
var args = process.argv.slice(2);
var fs = require('fs');

/**
 * Collect a sample of article titles from the live recent-changes stream.
 *
 * Every change in namespace 0 whose title is not yet a key of `titles` is
 * added to `titles` with the placeholder value ''.
 *
 * @param {number} sampleSize number of new titles to collect
 * @return {Promise} resolves (with no value) once `sampleSize` new titles
 *  have been added
 */
function sourceFromLiveEdits( sampleSize ) {
	var collected = 0;
	return new Promise( function ( resolve ) {
		// Keep a handle on the connection the listeners are attached to and
		// subscribe on that same connection. Subscribing on the module-level
		// `socket` only works when the client library hands back the very
		// same connection for a repeated connect() call.
		var stream = io.connect( 'stream.wikimedia.org/rc' );

		stream.on( 'connect', function () {
			stream.emit( 'subscribe', 'en.wikipedia.org' );
		} );
		stream.on( 'change', function ( data ) {
			// Only main-namespace changes count, and only until the sample is full.
			if ( data.namespace !== 0 || collected >= sampleSize ) {
				return;
			}
			if ( titles[data.title] === undefined ) {
				titles[data.title] = '';
				collected += 1;
				console.log( 'Collected', data.title, collected, 'results so far' );
			}
			if ( collected === sampleSize ) {
				resolve();
			}
		} );
	} );
}

/**
 * Look up the alternative-image result recorded for an article.
 *
 * @param {string} title article title
 * @return {*} the value stored in `alternativePageImageAvailable` for the
 *  title, or `undefined` when nothing has been stored
 */
function hasAlternativePageImage( title ) {
	var recorded = alternativePageImageAvailable[title];
	return recorded;
}

/**
 * Print summary statistics for the sample held in `titles`.
 *
 * Logs the raw counters (not free, total, no image, alternatives) followed by
 * three whole-number percentages: titles without a page image, non-free
 * images among the titles that have one, and non-free images for which an
 * alternative was found.
 */
function report() {
	var notfree = 0, alt = 0, total = 0, noimage = 0;

	// Whole-number percentage. A zero denominator yields 0 so that an empty
	// sample or category prints "0%" rather than "NaN%".
	function percentage( part, whole ) {
		return whole > 0 ? Math.floor( ( part / whole ) * 100 ) : 0;
	}

	// Object.keys() only visits own keys and keeps working even if a title
	// happens to be called "hasOwnProperty".
	Object.keys( titles ).forEach( function ( title ) {
		var status = titles[title];
		total += 1;
		if ( status === false ) {
			notfree += 1;
			if ( hasAlternativePageImage( title ) ) {
				alt += 1;
			}
		} else if ( status === 'N/A' ) {
			noimage += 1;
		}
	} );

	console.log( notfree, total, noimage, alt );
	console.log( percentage( noimage, total ) + '% of titles have no page image.' );
	console.log( percentage( notfree, total - noimage ) + '% of images in sample that have images are non-free images.' );
	console.log( percentage( alt, notfree ) + '% of images in sample that have non-free images have a free alternative.' );
}

// Endpoint of the English Wikipedia action API; every fetch() call in this
// file builds its URL from it.
var api = 'https://en.wikipedia.org/w/api.php';
// Endpoint of the Wikimedia Commons action API.
// NOTE(review): not referenced by any function in the visible code — confirm
// whether it is still needed.
var commonsApi = 'https://commons.wikimedia.org/w/api.php';

/**
 * Check whether an article uses an image that could stand in for its
 * non-free page image.
 *
 * Requests the dimensions and the `NonFree` extended metadata field of every
 * image used on the page and stores the outcome (true / false) in
 * `alternativePageImageAvailable[title]`.
 *
 * @param {string} title article title
 * @return {Promise} settles once the result has been recorded; it never
 *  rejects, request failures are logged instead
 */
function queryHasFreeImages( title ) {
	console.log( 'Checking status of all images on ', title, ' page... ');
	var url = api + '?' + param( {
		action: 'query',
		prop: 'imageinfo',
		generator: 'images',
		iiprop: 'extmetadata|dimensions',
		iiextmetadatafilter: 'NonFree',
		format: 'json',
		formatversion: 2,
		titles: title
	} );
	return fetch( url ).then( function ( resp ) {
		return resp.json();
	} ).then( function ( json ) {
		// `query` can be absent from the response (for example when the API
		// answers with an error object), so fall back to an empty list.
		var pages = ( json && json.query && json.query.pages ) || [];
		var suitableAlternative = pages.some( function ( page ) {
			// Entries without image info cannot be judged, so skip them.
			var info = page.imageinfo && page.imageinfo[0];
			if ( !info || !( info.width > 270 ) ) {
				return false;
			}
			// extmetadata is keyed by field name and has no `length`. Because
			// the request filters on `NonFree`, any key at all means the file
			// carries the non-free marker.
			return Object.keys( info.extmetadata || {} ).length === 0;
		} );
		alternativePageImageAvailable[title] = suitableAlternative;
		console.log( 'DATA: ', title, ' has alternative? ', suitableAlternative );
	} ).catch( function ( err ) {
		console.log( 'ERROR', 'could not check images used on', title, err );
	} );
}

/**
 * Work out whether each page image is free and record the verdict in `titles`.
 *
 * Asks the API whether each file is in "Category:All non-free media". The
 * article using a file is looked up through `pageImages`. A file in that
 * category marks its article as `false` (not free) and triggers a search for
 * an alternative image; a file that is missing locally, or not in the
 * category, marks its article as `true` (assumed free).
 *
 * @param {string[]} imgs file titles, including the 'File:' prefix
 * @return {Promise} settles once the batch has been processed; it never
 *  rejects, request failures are logged instead
 */
function queryNonFreeStatus( imgs ) {
	console.log( 'Querying non-free status... ');

	var url = api + '?' + param( {
		action: 'query',
		prop: 'categories',
		// Only the non-free marker category matters. Filtering on it keeps
		// the answer within the result limit, so a file with many categories
		// cannot push the marker of another file out of the response.
		clcategories: 'Category:All non-free media',
		cllimit: 'max',
		format: 'json',
		formatversion: 2,
		titles: imgs.join( '|' )
	} );
	console.log( url );
	return fetch( url ).then( function ( resp ) {
		console.log( 'got response [categories]' );
		return resp.json();
	} ).then( function ( files ) {
		// `query` can be absent (for example in an error response).
		var query = ( files && files.query ) || {};
		var denormalizedTitles = {};

		console.log( 'got json [categories]' );
		// Map normalised file titles back to the titles that were requested,
		// because `pageImages` is keyed by the requested form.
		( query.normalized || [] ).forEach( function ( data ) {
			denormalizedTitles[data.to] = data.from;
		} );
		console.log( denormalizedTitles );
		console.log( '$$$$$$$$ ' );
		( query.pages || [] ).forEach( function ( pageImage ) {
			var pageImageTitle = denormalizedTitles[pageImage.title] || pageImage.title;
			var title = pageImages[pageImageTitle];
			var nonFree;

			if ( !title ) {
				console.log( 'ERROR', pageImage.title, pageImageTitle, JSON.stringify( pageImages ) );
				return;
			}

			console.log( 'Inspecting ' + pageImage.title );
			// A file that is missing locally is assumed to live on Commons and
			// therefore to be free; so is a file without the marker category.
			nonFree = !pageImage.missing && ( pageImage.categories || [] ).some( function ( category ) {
				return category.title === 'Category:All non-free media';
			} );
			console.log( 'DATA:' + pageImage.title + ' used on ' + title + ' is ' + ( nonFree ? 'not free' : 'free' ) );
			titles[title] = !nonFree;
			if ( nonFree ) {
				queryHasFreeImages( title );
			}
		} );
	} ).catch( function ( err ) {
		console.log( 'ERROR', 'could not query non-free status', err );
	} );
}

/**
 * Record the outcome of a page-images query.
 *
 * Articles without a page image are marked 'N/A' in `titles`. For the others
 * the file title is remembered in `pageImages` and the whole batch of files
 * is handed to queryNonFreeStatus().
 *
 * @param {Object} json parsed API response of a `prop=pageimages` query
 */
function processPageImages( json ) {
	var imgs = [];
	// `query` can be absent (for example in an error response).
	var query = ( json && json.query ) || {};
	// Normalised title -> title as it was requested. Created without a
	// prototype so that no title can collide with an inherited property.
	var requestedTitles = Object.create( null );

	console.log( 'Got json [pageimages]' );
	// The response lists pages under their normalised title, while `titles`
	// is keyed by the title that was requested. Without mapping back, the
	// result would land on a new key and the requested one would stay
	// unclassified forever.
	( query.normalized || [] ).forEach( function ( entry ) {
		requestedTitles[entry.to] = entry.from;
	} );

	( query.pages || [] ).forEach( function ( page ) {
		var title = requestedTitles[page.title] || page.title;
		var file;
		if ( page.pageimage ) {
			// prepare to work out if the page image is free.
			file = 'File:' + page.pageimage;
			imgs.push( file );
			pageImages[file] = title;
		} else {
			console.log( 'DATA:' + page.title + ' has no page image' );
			// Nothing to classify: mark the article as having no page image.
			titles[title] = 'N/A';
		}
	} );

	if ( imgs.length ) {
		console.log( 'querying for ' + imgs.length + ' page images non-free/free status' );
		queryNonFreeStatus( imgs );
	}
}

/**
 * Ask the API for the page image of each title and hand the parsed response
 * to processPageImages().
 *
 * @param {string[]} query article titles to look up
 * @return {Promise} settles once the response has been processed; it never
 *  rejects, request failures are logged instead
 */
function fetchPageImages( query ) {
	console.log( 'Querying ', query.length, ' entries for sample' );
	var url = api + '?' + param( {
		action: 'query',
		prop: 'pageimages',
		formatversion: 2,
		format: 'json',
		piprop: 'name',
		pilimit: query.length,
		titles: query.join( '|' )
	} );
	return fetch( url ).then( function ( resp ) {
		return resp.json();
	} ).then( function ( json ) {
		processPageImages( json );
	} ).catch( function ( err ) {
		// A failed request must not take the process down as an unhandled
		// rejection. The titles simply keep their unclassified value.
		console.log( 'ERROR', 'page image request failed', err );
	} );
}

// `results` receives the promise returned by whichever sample source is
// selected further down; `source` is the first command line argument and is
// what that selection is based on.
var results,
	source = args[0];

/**
 * Load a sample of article titles from a text file with one title per line.
 *
 * Underscores are replaced with spaces and every non-empty line is added to
 * `titles` with the placeholder value ''.
 *
 * @param {string} path file name, relative to the directory of this script
 * @return {Promise} resolves (with no value) once all titles are loaded;
 *  rejects with the read error when the file cannot be read
 */
function loadFile( path ) {
	return new Promise( function ( resolve, reject ) {
		fs.readFile( __dirname + '/' + path, function ( err, data ) {
			// Surface read failures through the promise. Otherwise `data` is
			// undefined and the callback dies on an unrelated TypeError.
			if ( err ) {
				reject( err );
				return;
			}
			console.log( 'Loaded file' );
			console.log( 'Populating data structure' );
			// Accept both LF and CRLF line endings.
			data.toString().split( /\r?\n/ ).forEach( function ( line ) {
				var title = line.replace( /_/g, ' ' ).trim();
				if ( title ) {
					titles[title] = '';
				}
			} );
			resolve();
		} );
	} );
}

// Start gathering the sample from the source named on the command line.
switch ( source ) {
	case 'rcstream':
		// The optional second argument is the sample size; it falls back to
		// 10 when missing or not a number.
		results = sourceFromLiveEdits( parseInt( args[1], 10 ) || 10 );
		break;
	case 'file':
		results = loadFile( 'titles.txt' );
		break;
	default:
		throw Error( 'Please provide a source e.g. `node index.js rcstream`' );
}
// Once the sample is available, classify it in batches and print the report
// when no title is left unclassified.
results.then( function () {
	var interval;
	console.log( 'Got required sample' );

	// Every 5 seconds: pick up to 10 titles that have neither a boolean
	// verdict nor the 'N/A' marker and query them. When none are left, stop
	// polling and print the report.
	interval = setInterval( function () {
		var batch = [];
		var title, status;

		for ( title in titles ) {
			if ( !titles.hasOwnProperty( title ) ) {
				continue;
			}
			status = titles[title];
			if ( typeof status === 'boolean' || status === 'N/A' ) {
				continue;
			}
			batch.push( title );
			if ( batch.length === 10 ) {
				break;
			}
		}

		if ( batch.length === 0 ) {
			// No new ones needed to be processed so do report
			clearInterval( interval );
			report();
			return;
		}
		fetchPageImages( batch );
	}, 5000 );
} );
	// NOTE(review): these declarations repeat the ones at the top of the file.
	// Running them again replaces the shared state objects (`titles`,
	// `pageImages`, `alternativePageImageAvailable`) with fresh empty ones and
	// calls io.connect() a second time — confirm the repetition is intended.
	var fetch = require('node-fetch');
	var io = require( 'socket.io-client' );
	// Sample of article titles, keyed by title; values written in this file are
	// '', true, false and 'N/A'.
	var titles = {};
	// Maps a page image file title ('File:…') to the article title it was found on.
	var pageImages = {};
	// Maps an article title to whether a suitable alternative image was found.
	var alternativePageImageAvailable = {};
	var interval;
	var socket = io.connect('stream.wikimedia.org/rc');
	// Used to turn a plain object of API parameters into a URL query string.
	var param = require('node-jquery-param');
	// Command line arguments that follow the node binary and the script path.
	var args = process.argv.slice(2);
	var fs = require('fs');

	/**
	 * Collect a sample of article titles from the live recent-changes stream.
	 *
	 * Every change in namespace 0 whose title is not yet a key of `titles` is
	 * added to `titles` with the placeholder value ''.
	 *
	 * @param {number} sampleSize number of new titles to collect
	 * @return {Promise} resolves (with no value) once `sampleSize` new titles
	 *  have been added
	 */
	function sourceFromLiveEdits( sampleSize ) {
		var collected = 0;
		return new Promise( function ( resolve ) {
			// Keep a handle on the connection the listeners are attached to and
			// subscribe on that same connection. Subscribing on the module-level
			// `socket` only works when the client library hands back the very
			// same connection for a repeated connect() call.
			var stream = io.connect( 'stream.wikimedia.org/rc' );

			stream.on( 'connect', function () {
				stream.emit( 'subscribe', 'en.wikipedia.org' );
			} );
			stream.on( 'change', function ( data ) {
				// Only main-namespace changes count, and only until the sample is full.
				if ( data.namespace !== 0 || collected >= sampleSize ) {
					return;
				}
				if ( titles[data.title] === undefined ) {
					titles[data.title] = '';
					collected += 1;
					console.log( 'Collected', data.title, collected, 'results so far' );
				}
				if ( collected === sampleSize ) {
					resolve();
				}
			} );
		} );
	}

	/**
	 * Look up the alternative-image result recorded for an article.
	 *
	 * @param {string} title article title
	 * @return {*} the value stored in `alternativePageImageAvailable` for the
	 *  title, or `undefined` when nothing has been stored
	 */
	function hasAlternativePageImage( title ) {
		var recorded = alternativePageImageAvailable[title];
		return recorded;
	}

	/**
	 * Print summary statistics for the sample held in `titles`.
	 *
	 * Logs the raw counters (not free, total, no image, alternatives) followed by
	 * three whole-number percentages: titles without a page image, non-free
	 * images among the titles that have one, and non-free images for which an
	 * alternative was found.
	 */
	function report() {
		var notfree = 0, alt = 0, total = 0, noimage = 0;

		// Whole-number percentage. A zero denominator yields 0 so that an empty
		// sample or category prints "0%" rather than "NaN%".
		function percentage( part, whole ) {
			return whole > 0 ? Math.floor( ( part / whole ) * 100 ) : 0;
		}

		// Object.keys() only visits own keys and keeps working even if a title
		// happens to be called "hasOwnProperty".
		Object.keys( titles ).forEach( function ( title ) {
			var status = titles[title];
			total += 1;
			if ( status === false ) {
				notfree += 1;
				if ( hasAlternativePageImage( title ) ) {
					alt += 1;
				}
			} else if ( status === 'N/A' ) {
				noimage += 1;
			}
		} );

		console.log( notfree, total, noimage, alt );
		console.log( percentage( noimage, total ) + '% of titles have no page image.' );
		console.log( percentage( notfree, total - noimage ) + '% of images in sample that have images are non-free images.' );
		console.log( percentage( alt, notfree ) + '% of images in sample that have non-free images have a free alternative.' );
	}

	// Endpoint of the English Wikipedia action API; every fetch() call in this
	// file builds its URL from it.
	var api = 'https://en.wikipedia.org/w/api.php';
	// Endpoint of the Wikimedia Commons action API.
	// NOTE(review): not referenced by any function in the visible code — confirm
	// whether it is still needed.
	var commonsApi = 'https://commons.wikimedia.org/w/api.php';

	/**
	 * Check whether an article uses an image that could stand in for its
	 * non-free page image.
	 *
	 * Requests the dimensions and the `NonFree` extended metadata field of every
	 * image used on the page and stores the outcome (true / false) in
	 * `alternativePageImageAvailable[title]`.
	 *
	 * @param {string} title article title
	 * @return {Promise} settles once the result has been recorded; it never
	 *  rejects, request failures are logged instead
	 */
	function queryHasFreeImages( title ) {
		console.log( 'Checking status of all images on ', title, ' page... ');
		var url = api + '?' + param( {
			action: 'query',
			prop: 'imageinfo',
			generator: 'images',
			iiprop: 'extmetadata|dimensions',
			iiextmetadatafilter: 'NonFree',
			format: 'json',
			formatversion: 2,
			titles: title
		} );
		return fetch( url ).then( function ( resp ) {
			return resp.json();
		} ).then( function ( json ) {
			// `query` can be absent from the response (for example when the API
			// answers with an error object), so fall back to an empty list.
			var pages = ( json && json.query && json.query.pages ) || [];
			var suitableAlternative = pages.some( function ( page ) {
				// Entries without image info cannot be judged, so skip them.
				var info = page.imageinfo && page.imageinfo[0];
				if ( !info || !( info.width > 270 ) ) {
					return false;
				}
				// extmetadata is keyed by field name and has no `length`. Because
				// the request filters on `NonFree`, any key at all means the file
				// carries the non-free marker.
				return Object.keys( info.extmetadata || {} ).length === 0;
			} );
			alternativePageImageAvailable[title] = suitableAlternative;
			console.log( 'DATA: ', title, ' has alternative? ', suitableAlternative );
		} ).catch( function ( err ) {
			console.log( 'ERROR', 'could not check images used on', title, err );
		} );
	}

	/**
	 * Work out whether each page image is free and record the verdict in `titles`.
	 *
	 * Asks the API whether each file is in "Category:All non-free media". The
	 * article using a file is looked up through `pageImages`. A file in that
	 * category marks its article as `false` (not free) and triggers a search for
	 * an alternative image; a file that is missing locally, or not in the
	 * category, marks its article as `true` (assumed free).
	 *
	 * @param {string[]} imgs file titles, including the 'File:' prefix
	 * @return {Promise} settles once the batch has been processed; it never
	 *  rejects, request failures are logged instead
	 */
	function queryNonFreeStatus( imgs ) {
		console.log( 'Querying non-free status... ');

		var url = api + '?' + param( {
			action: 'query',
			prop: 'categories',
			// Only the non-free marker category matters. Filtering on it keeps
			// the answer within the result limit, so a file with many categories
			// cannot push the marker of another file out of the response.
			clcategories: 'Category:All non-free media',
			cllimit: 'max',
			format: 'json',
			formatversion: 2,
			titles: imgs.join( '|' )
		} );
		console.log( url );
		return fetch( url ).then( function ( resp ) {
			console.log( 'got response [categories]' );
			return resp.json();
		} ).then( function ( files ) {
			// `query` can be absent (for example in an error response).
			var query = ( files && files.query ) || {};
			var denormalizedTitles = {};

			console.log( 'got json [categories]' );
			// Map normalised file titles back to the titles that were requested,
			// because `pageImages` is keyed by the requested form.
			( query.normalized || [] ).forEach( function ( data ) {
				denormalizedTitles[data.to] = data.from;
			} );
			console.log( denormalizedTitles );
			console.log( '$$$$$$$$ ' );
			( query.pages || [] ).forEach( function ( pageImage ) {
				var pageImageTitle = denormalizedTitles[pageImage.title] || pageImage.title;
				var title = pageImages[pageImageTitle];
				var nonFree;

				if ( !title ) {
					console.log( 'ERROR', pageImage.title, pageImageTitle, JSON.stringify( pageImages ) );
					return;
				}

				console.log( 'Inspecting ' + pageImage.title );
				// A file that is missing locally is assumed to live on Commons and
				// therefore to be free; so is a file without the marker category.
				nonFree = !pageImage.missing && ( pageImage.categories || [] ).some( function ( category ) {
					return category.title === 'Category:All non-free media';
				} );
				console.log( 'DATA:' + pageImage.title + ' used on ' + title + ' is ' + ( nonFree ? 'not free' : 'free' ) );
				titles[title] = !nonFree;
				if ( nonFree ) {
					queryHasFreeImages( title );
				}
			} );
		} ).catch( function ( err ) {
			console.log( 'ERROR', 'could not query non-free status', err );
		} );
	}

	/**
	 * Record the outcome of a page-images query.
	 *
	 * Articles without a page image are marked 'N/A' in `titles`. For the others
	 * the file title is remembered in `pageImages` and the whole batch of files
	 * is handed to queryNonFreeStatus().
	 *
	 * @param {Object} json parsed API response of a `prop=pageimages` query
	 */
	function processPageImages( json ) {
		var imgs = [];
		// `query` can be absent (for example in an error response).
		var query = ( json && json.query ) || {};
		// Normalised title -> title as it was requested. Created without a
		// prototype so that no title can collide with an inherited property.
		var requestedTitles = Object.create( null );

		console.log( 'Got json [pageimages]' );
		// The response lists pages under their normalised title, while `titles`
		// is keyed by the title that was requested. Without mapping back, the
		// result would land on a new key and the requested one would stay
		// unclassified forever.
		( query.normalized || [] ).forEach( function ( entry ) {
			requestedTitles[entry.to] = entry.from;
		} );

		( query.pages || [] ).forEach( function ( page ) {
			var title = requestedTitles[page.title] || page.title;
			var file;
			if ( page.pageimage ) {
				// prepare to work out if the page image is free.
				file = 'File:' + page.pageimage;
				imgs.push( file );
				pageImages[file] = title;
			} else {
				console.log( 'DATA:' + page.title + ' has no page image' );
				// Nothing to classify: mark the article as having no page image.
				titles[title] = 'N/A';
			}
		} );

		if ( imgs.length ) {
			console.log( 'querying for ' + imgs.length + ' page images non-free/free status' );
			queryNonFreeStatus( imgs );
		}
	}

	/**
	 * Ask the API for the page image of each title and hand the parsed response
	 * to processPageImages().
	 *
	 * @param {string[]} query article titles to look up
	 * @return {Promise} settles once the response has been processed; it never
	 *  rejects, request failures are logged instead
	 */
	function fetchPageImages( query ) {
		console.log( 'Querying ', query.length, ' entries for sample' );
		var url = api + '?' + param( {
			action: 'query',
			prop: 'pageimages',
			formatversion: 2,
			format: 'json',
			piprop: 'name',
			pilimit: query.length,
			titles: query.join( '|' )
		} );
		return fetch( url ).then( function ( resp ) {
			return resp.json();
		} ).then( function ( json ) {
			processPageImages( json );
		} ).catch( function ( err ) {
			// A failed request must not take the process down as an unhandled
			// rejection. The titles simply keep their unclassified value.
			console.log( 'ERROR', 'page image request failed', err );
		} );
	}

	// `results` receives the promise returned by whichever sample source is
	// selected further down; `source` is the first command line argument and is
	// what that selection is based on.
	var results,
	source = args[0];

	/**
	 * Load a sample of article titles from a text file with one title per line.
	 *
	 * Underscores are replaced with spaces and every non-empty line is added to
	 * `titles` with the placeholder value ''.
	 *
	 * @param {string} path file name, relative to the directory of this script
	 * @return {Promise} resolves (with no value) once all titles are loaded;
	 *  rejects with the read error when the file cannot be read
	 */
	function loadFile( path ) {
		return new Promise( function ( resolve, reject ) {
			fs.readFile( __dirname + '/' + path, function ( err, data ) {
				// Surface read failures through the promise. Otherwise `data` is
				// undefined and the callback dies on an unrelated TypeError.
				if ( err ) {
					reject( err );
					return;
				}
				console.log( 'Loaded file' );
				console.log( 'Populating data structure' );
				// Accept both LF and CRLF line endings.
				data.toString().split( /\r?\n/ ).forEach( function ( line ) {
					var title = line.replace( /_/g, ' ' ).trim();
					if ( title ) {
						titles[title] = '';
					}
				} );
				resolve();
			} );
		} );
	}

	// Start gathering the sample from the source named on the command line.
	if ( source === 'rcstream' ) {
		// The optional second argument is the sample size; it falls back to 10
		// when missing or not a number. (The operator used to be written with
		// backslash-escaped pipes, which is not valid JavaScript.)
		results = sourceFromLiveEdits( parseInt( args[1], 10 ) || 10 );
	} else if ( source === 'file' ) {
		results = loadFile( 'titles.txt' );
	} else {
		throw new Error( 'Please provide a source e.g. `node index.js rcstream`' );
	}
	// Once the sample is available, classify it in batches and print the report
	// when no title is left unclassified.
	results.then( function () {
		var interval;
		console.log( 'Got required sample' );

		// Every 5 seconds: pick up to 10 titles that have neither a boolean
		// verdict nor the 'N/A' marker and query them. When none are left, stop
		// polling and print the report.
		interval = setInterval( function () {
			var batch = [];
			var title, status;

			for ( title in titles ) {
				if ( !titles.hasOwnProperty( title ) ) {
					continue;
				}
				status = titles[title];
				if ( typeof status === 'boolean' || status === 'N/A' ) {
					continue;
				}
				batch.push( title );
				if ( batch.length === 10 ) {
					break;
				}
			}

			if ( batch.length === 0 ) {
				// No new ones needed to be processed so do report
				clearInterval( interval );
				report();
				return;
			}
			fetchPageImages( batch );
		}, 5000 );
	} );