Instantly share code, notes, and snippets.

Embed
What would you like to do?
Code for adding Commons categories to the English Wikipedia listed building pages
<?php
/*
 * Bot script: for every "EH listed building row" on the configured English
 * Wikipedia list pages that has no commonscat yet, try to find a matching
 * Wikimedia Commons category and write it into the row.
 *
 * Candidate categories are looked for in three stages:
 *   1. among the categories of the row's existing image on Commons;
 *   2. for churches, among a handful of formulaic category names;
 *   3. by a prefix search of the Commons category namespace.
 * A candidate is only written if the row's image (when there is one) is in
 * that category, and if one of the category's own parent categories mentions
 * the list's subdivision area.
 */
require_once( '../mw-peachy/Init.php' );
set_time_limit( 0 );

$site = Peachy::newWiki( 'livingbot' );
$commons = Peachy::newWiki( null, null, null, 'https://commons.wikimedia.org/w/api.php' );
$http = new HTTP();

$template = '{{EH listed building row';
// Building types treated as churches. Defined up front because it is
// consulted while matching image categories, before the church-specific stage.
$churches = array( 'Church', 'Parish Church' );
$lists = array( 'Grade I listed buildings in Suffolk Coastal' /* etc */ );

foreach( $lists as $list ){
    echo "Editing $list... ";
    file_put_contents( 'log.txt', "\n$list\n--------------\n", FILE_APPEND );

    $list = new Page( $site, $list );
    $content = $list->get_text();

    // Splitting on the template opener yields the page preamble followed by
    // one chunk per row; implode() with the same string restores the page.
    $sections = explode( $template, $content );
    if( count( $sections ) < 2 ) continue;

    // The area is read from the preamble (everything before the first row)
    $area = extractParameterFromTemplate( 'subdivision_area', $sections[0] );
    if( !$area ) continue;
    $areaPattern = preg_quote( $area, '/' );

    foreach( $sections as &$section ){
        // Rows that already carry a non-empty commonscat are left alone
        if( extractParameterFromTemplate( 'commonscat', $section ) !== false ) continue;

        $cleanName = $name = extractParameterFromTemplate( 'name', $section );
        if( $name === false ) continue;

        $image = extractParameterFromTemplate( 'image', $section );
        $type = extractParameterFromTemplate( 'type', $section );
        $location = extractParameterFromTemplate( 'location', $section );
        // Keep only the part of the location before the first comma
        list( $location, ) = explode( ',', $location );
        $disambiguatedLocation = "$location, $area";

        // Place names are data, not regex syntax: quote them before they are
        // embedded in a pattern.
        // NOTE(review): a row without a location yields an empty pattern
        // here, which makes the suffix checks below match any title.
        $locationPattern = preg_quote( $location, '/' );
        $isChurch = in_array( $type, $churches, true );

        $newCommonsCat = false;
        $existingImageExemption = false;

        if( $image !== false ){
            // Try to recover one from an image
            $categories = getCategoriesOfPage( "File:$image" );
            foreach( $categories as $category ){
                $endsWithLocation = preg_match( "/$locationPattern(,[^)]+)?\)?$/", $category );
                $endsWithLocationOrArea = preg_match( "/$locationPattern(, $areaPattern)?\)?$/", $category );
                if( $category === "Category:$name"
                    || $category === $name
                    || ( stripos( $category, $cleanName ) !== false && $endsWithLocation )
                    || ( stripos( $category, "Church" ) !== false && $isChurch && $endsWithLocationOrArea ) ){
                    $newCommonsCat = str_replace( 'Category:', '', $category );
                    // The category came from the image itself, so the
                    // "image is in the category" sanity check is redundant
                    $existingImageExemption = true;
                    break;
                }
            }
        }

        if( $newCommonsCat === false && $isChurch ) {
            // Churches are particularly formulaic in their category structure
            // -- and well covered on Commons -- so we can try to be a bit more intelligent
            $cleanName = preg_replace( '/^(Parish )?Church of /', '', $cleanName );
            $cleanName = str_replace( array( 'St. ', 'Saint ' ), 'St ', $cleanName );
            // Only dedications of a recognised shape are attempted; other
            // church rows are skipped altogether
            if( !preg_match( '/^(St|All|the) /', $cleanName ) ) continue;
            $cleanName = preg_replace( '/^the /', 'Church of the ', $cleanName );
            $alternatives = array(
                "$cleanName, $disambiguatedLocation",
                "$cleanName ($disambiguatedLocation)",
                "$cleanName, $location",
                "$cleanName' Church, $location",
                "$cleanName's Church, $disambiguatedLocation",
                "$cleanName's Church, $location"
            );
            foreach( $alternatives as $alternative ){
                if( categoryExists( $alternative ) ){
                    $newCommonsCat = $alternative;
                    break;
                }
            }
        }

        if( $newCommonsCat === false ){
            // Brute force: prefix-search the category namespace (14) for the
            // name and its "Saint" spelling, keeping the first title that
            // ends in the location. array_unique() avoids issuing the same
            // query twice when the name contains no "St ".
            $alternatives = array_unique( array( $cleanName, str_replace( 'St ', 'Saint ', $cleanName ) ) );
            foreach( $alternatives as $alternative ){
                $possible = $commons->allpages( 14, $alternative );
                foreach( $possible as $category ){
                    $category = $category['title'];
                    if( preg_match( "/$locationPattern(,[^)]+)?\)?$/", $category ) ) {
                        $newCommonsCat = str_replace( 'Category:', '', $category );
                        // Leave both loops so that a later alternative
                        // cannot overwrite the match just found
                        break 2;
                    }
                }
            }
        }

        if( $newCommonsCat !== false ) {
            if( $image === false ){
                file_put_contents( 'log.txt', "Interesting, $name now has Category:$newCommonsCat but no image...\n", FILE_APPEND );
            } else {
                // Sanity check: existing image should have category
                if( !$existingImageExemption && !pageHasCategory( "File:$image", $newCommonsCat ) ){
                    file_put_contents( 'log.txt', "Alas File:$image does not have proposed Category:$newCommonsCat<br>", FILE_APPEND );
                    continue;
                }
            }

            // Second sanity check: one of the category's parent categories
            // must mention the list's area
            $categoriesOfCategory = getCategoriesOfPage( "Category:$newCommonsCat" );
            $mentionsArea = false;
            foreach( $categoriesOfCategory as $category ){
                if( stripos( $category, $area ) !== false ){
                    $mentionsArea = true;
                    break;
                }
            }
            if( !$mentionsArea ) {
                file_put_contents( 'log.txt', "Alas Category:$newCommonsCat does not mention area $area<br>", FILE_APPEND );
                continue;
            }

            echo "Adding $newCommonsCat to $name<br>";
            if( strpos( $section, 'commonscat' ) !== false ){
                // An empty commonscat parameter is present: fill it in
                $section = preg_replace( '/(commonscat *=)/', "$1 $newCommonsCat", $section );
            } else {
                // Otherwise add a commonscat line right after the image
                // line, copying that line's spacing
                $section = preg_replace( '/(( *)image( *)=.*)/', "$1\n|$2".'commonscat'."$3= $newCommonsCat", $section, 1 );
            }
        }
    }
    // Drop the by-reference loop variable so that nothing written to
    // $section later can reach the last row
    unset( $section );

    // Only save when at least one row actually changed
    $newContent = implode( $template, $sections );
    if( $newContent !== $content ){
        $list->edit( $newContent, 'Try adding commonscats using a script-assisted method', false, false );
    }
}
/**
 * Reads the value of a named parameter out of a chunk of template wikitext.
 *
 * Wiki links are flattened first ("[[Target|Label]]" becomes "Label" and
 * "[[Target]]" becomes "Target"); the value is then the rest of the line
 * following "| parameter =", with surrounding whitespace trimmed.
 *
 * @param string $parameter Parameter name; matched literally.
 * @param string $template  Wikitext to search.
 * @return string|false The trimmed value, or false when the parameter is
 *                      absent or its value is empty.
 */
function extractParameterFromTemplate( $parameter, $template ){
    // Trim links
    $template = preg_replace( '/\[\[([^]]+\|)?/', '', $template );
    $template = str_replace( ']]', '', $template );

    // Quote the name so that characters such as "/" or "." in it cannot
    // break or widen the pattern. No link handling is needed here: every
    // "[[" has already been removed above.
    $pattern = '/\| *' . preg_quote( $parameter, '/' ) . ' *= *(.*)/';
    if( !preg_match( $pattern, $template, $matches ) ) return false;

    $result = trim( $matches[1] );
    return ( $result !== '' ) ? $result : false;
}
/**
 * Asks the Commons API whether a category page exists.
 *
 * Uses the global $http client to query the title "Category:<name>".
 *
 * @param string $category Category name without the "Category:" prefix.
 * @return bool True only when the API returned a page entry that is flagged
 *              neither "missing" nor "invalid". A failed request or an
 *              unreadable response counts as "does not exist", so that a
 *              network hiccup can never produce a bogus category.
 */
function categoryExists( $category ){
    global $http;
    $json = $http->get( 'https://commons.wikimedia.org/w/api.php?format=json&action=query&titles=Category:' . urlencode( $category ) );
    $arr = is_string( $json ) ? json_decode( $json, true ) : null;
    if( !is_array( $arr ) || !isset( $arr['query']['pages'] ) || !is_array( $arr['query']['pages'] ) ) return false;

    // A single title was requested, so there is a single page entry
    $page = array_pop( $arr['query']['pages'] );
    if( !is_array( $page ) ) return false;

    return !isset( $page['missing'] ) && !isset( $page['invalid'] );
}
/**
 * Fetches the titles of the categories a Commons page belongs to.
 *
 * Uses the global $http client. The request asks for the API's maximum
 * number of categories per page (cllimit=max) rather than its small default,
 * so heavily categorised pages are not cut short.
 *
 * @param string $page Full page title, e.g. "File:Foo.jpg" or "Category:Bar".
 * @return array List of category titles as returned by the API (each
 *               including its "Category:" prefix). Empty when the page is
 *               missing, has no categories, or the response is unusable.
 */
function getCategoriesOfPage( $page ){
    global $http;
    // urldecode() first so that a title that is already percent-encoded is
    // not encoded twice
    $json = $http->get( 'https://commons.wikimedia.org/w/api.php?format=json&action=query&prop=categories&cllimit=max&titles=' . urlencode( urldecode( $page ) ) );
    $arr = is_string( $json ) ? json_decode( $json, true ) : null;
    if( !is_array( $arr ) || !isset( $arr['query']['pages'] ) || !is_array( $arr['query']['pages'] ) ) return array();

    // A single title was requested, so there is a single page entry
    $page = array_pop( $arr['query']['pages'] );
    if( !is_array( $page ) || isset( $page['missing'] ) ) return array();

    // Uncategorised pages come back without a "categories" key at all
    if( !isset( $page['categories'] ) || !is_array( $page['categories'] ) ) return array();

    $categories = array();
    foreach( $page['categories'] as $pageCategory ){
        $categories[] = $pageCategory['title'];
    }
    return $categories;
}
/**
 * Tells whether a Commons page is a direct member of the given category.
 *
 * @param string $page     Full page title, e.g. "File:Foo.jpg".
 * @param string $category Category name without the "Category:" prefix.
 * @return bool True when "Category:<name>" is among the page's categories.
 */
function pageHasCategory( $page, $category ){
    $wanted = "Category:$category";
    $position = array_search( $wanted, getCategoriesOfPage( $page ) );
    return $position !== false;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment