Code for adding Commons categories to the English Wikipedia listed building pages
<?php
/**
 * Adds Commons categories to the English Wikipedia listed building pages.
 *
 * For every list page in $lists the page text is split on the row template
 * marker. For each chunk that has a name but no commonscat value yet, a Commons
 * category is looked for in three stages:
 *   1. among the categories of the row's image on Commons;
 *   2. for churches, by probing a set of formulaic category names;
 *   3. by listing Commons categories that start with the (cleaned) name.
 * A candidate is only written back if the row's image (when there is one and
 * the candidate did not come from it) carries the category, and if one of the
 * candidate's own parent categories mentions the list's subdivision area.
 * Rejected candidates and oddities are appended to log.txt.
 */
require_once( '../mw-peachy/Init.php' );
set_time_limit( 0 );

$site = Peachy::newWiki( 'livingbot' );
$commons = Peachy::newWiki( null, null, null, 'https://commons.wikimedia.org/w/api.php' );
$http = new HTTP();
$template = '{{EH listed building row';
$lists = array( 'Grade I listed buildings in Suffolk Coastal' /* etc */ );
// Building types treated as churches. Defined before the loops because the
// image-based matching below already consults it for the very first row.
$churches = array( 'Church', 'Parish Church' );

foreach( $lists as $list ){
    echo "Editing $list... ";
    file_put_contents( 'log.txt', "\n$list\n--------------\n", FILE_APPEND );
    $list = new Page( $site, $list );
    $content = $list->get_text();
    $sections = explode( $template, $content );
    if( count( $sections ) < 2 ) continue;
    // The text before the first row template is expected to carry the area
    $area = extractParameterFromTemplate( 'subdivision_area', $sections[0] );
    if( !$area ) continue;
    $areaPattern = preg_quote( $area, '/' );

    foreach( $sections as &$section ){
        if( extractParameterFromTemplate( 'commonscat', $section ) !== false ) continue;
        $cleanName = $name = extractParameterFromTemplate( 'name', $section );
        if( $name === false ) continue;
        $image = extractParameterFromTemplate( 'image', $section );
        $type = extractParameterFromTemplate( 'type', $section );
        $location = extractParameterFromTemplate( 'location', $section );
        // Keep only the part of the location before the first comma
        // (a missing location becomes the empty string)
        list( $location, ) = explode( ',', (string)$location );
        $hasLocation = ( $location !== '' );
        $disambiguatedLocation = "$location, $area";

        // Location and area are page text, not regex source: quote them.
        $locationPattern = preg_quote( $location, '/' );
        // Category ends in "<location>", "<location>, <anything>" or either followed by ")"
        $locationSuffix = '/' . $locationPattern . '(,[^)]+)?\)?$/';
        // Category ends in "<location>" or "<location>, <area>", optionally followed by ")"
        $churchSuffix = '/' . $locationPattern . '(, ' . $areaPattern . ')?\)?$/';

        $newCommonsCat = false;
        $existingImageExemption = false;
        if( $image !== false ){
            // Try to recover one from an image
            $categories = getCategoriesOfPage( "File:$image" );
            foreach( $categories as $category ){
                $exactName = ( $category === "Category:$name" || $category === $name );
                // Without a location the suffix patterns would match every
                // category, so location-based matches require one.
                $nameAtLocation = $hasLocation
                    && stripos( $category, $cleanName ) !== false
                    && preg_match( $locationSuffix, $category );
                $churchAtLocation = $hasLocation
                    && stripos( $category, "Church" ) !== false
                    && in_array( $type, $churches, true )
                    && preg_match( $churchSuffix, $category );
                if( $exactName || $nameAtLocation || $churchAtLocation ){
                    $newCommonsCat = str_replace( 'Category:', '', $category );
                    // The category was read off the image itself, so the
                    // "image has category" sanity check below is redundant
                    $existingImageExemption = true;
                    break;
                }
            }
        }

        // Every remaining strategy keys on the location; with none there is
        // nothing meaningful left to try.
        if( $newCommonsCat === false && !$hasLocation ) continue;

        if( $newCommonsCat === false && in_array( $type, $churches, true ) ) {
            // Churches are particularly formulaic in their category structure
            // -- and well covered on Commons -- so we can try to be a bit more intelligent
            $cleanName = preg_replace( '/^(Parish )?Church of /', '', $cleanName );
            $cleanName = str_replace( array( 'St. ', 'Saint ' ), 'St ', $cleanName );
            if( !preg_match( '/^(St|All|the) /', $cleanName ) ) continue;
            $cleanName = preg_replace( '/^the /', 'Church of the ', $cleanName );
            $alternatives = array(
                "$cleanName, $disambiguatedLocation",
                "$cleanName ($disambiguatedLocation)",
                "$cleanName, $location",
                "$cleanName' Church, $location",
                "$cleanName's Church, $disambiguatedLocation",
                "$cleanName's Church, $location"
            );
            foreach( $alternatives as $alternative ){
                if( categoryExists( $alternative ) ){
                    $newCommonsCat = $alternative;
                    break;
                }
            }
        }

        if( $newCommonsCat === false ){
            // Brute force: list Commons categories starting with the name and
            // take the first one that ends in the location
            $alternatives = array_unique( array( $cleanName, str_replace( 'St ', 'Saint ', $cleanName ) ) );
            foreach( $alternatives as $alternative ){
                $possible = $commons->allpages( 14, $alternative );
                foreach( $possible as $category ){
                    $title = $category['title'];
                    if( preg_match( $locationSuffix, $title ) ) {
                        $newCommonsCat = str_replace( 'Category:', '', $title );
                        // Stop at the first hit so that a later alternative
                        // cannot overwrite it
                        break 2;
                    }
                }
            }
        }

        if( $newCommonsCat !== false ) {
            if( $image === false ){
                file_put_contents( 'log.txt', "Interesting, $name now has Category:$newCommonsCat but no image...\n", FILE_APPEND );
            } else {
                // Sanity check: existing image should have category
                if( !$existingImageExemption && !pageHasCategory( "File:$image", $newCommonsCat ) ){
                    file_put_contents( 'log.txt', "Alas File:$image does not have proposed Category:$newCommonsCat\n", FILE_APPEND );
                    continue;
                }
            }

            // Sanity check: one of the category's parents should mention the area
            $categoriesOfCategory = getCategoriesOfPage( "Category:$newCommonsCat" );
            $mentionsArea = false;
            foreach( $categoriesOfCategory as $category ){
                if( stripos( $category, $area ) !== false ){
                    $mentionsArea = true;
                    break;
                }
            }
            if( !$mentionsArea ) {
                file_put_contents( 'log.txt', "Alas Category:$newCommonsCat does not mention area $area\n", FILE_APPEND );
                continue;
            }

            echo "Adding $newCommonsCat to $name<br>";
            // "$" and "\" are special in a preg_replace replacement string;
            // escape them so the category name is inserted literally
            $replacementCat = addcslashes( $newCommonsCat, '\\$' );
            if( strpos( $section, 'commonscat' ) !== false ){
                // An empty commonscat parameter is already present: fill it in
                $section = preg_replace( '/(commonscat *=)/', '$1 ' . $replacementCat, $section );
            } else {
                // Add a commonscat line after the image line, reusing its spacing
                $section = preg_replace( '/(( *)image( *)=.*)/', "$1\n|$2".'commonscat'."$3= " . $replacementCat, $section, 1 );
            }
        }
    }
    // Drop the reference so later writes to $section cannot touch $sections
    unset( $section );

    $newContent = implode( $template, $sections );
    // Nothing to save if no row was changed
    if( $newContent !== $content ){
        $list->edit( $newContent, 'Try adding commonscats using a script-assisted method', false, false );
    }
}
/**
 * Extracts the value of a named parameter from a piece of template wikitext.
 *
 * Wiki links are flattened first ("[[Target|Label]]" becomes "Label",
 * "[[Target]]" becomes "Target"). The value is whatever follows
 * "| <parameter> =" up to the end of that line, trimmed.
 *
 * @param string $parameter Parameter name; matched literally
 * @param string $template Wikitext to search
 * @return string|false The trimmed value, or false if the parameter is absent or empty
 */
function extractParameterFromTemplate( $parameter, $template ){
    // Trim links
    $template = preg_replace( '/\[\[([^]]+\|)?/', '', $template );
    $template = str_replace( ']]', '', $template );
    // Quote the name so characters such as "." or "(" in it are not treated as regex syntax
    $pattern = '/\| *' . preg_quote( $parameter, '/' ) . ' *= *(\[\[([^]]+\|)?)?(.*)/';
    if( !preg_match( $pattern, $template, $matches ) ) return false;
    // The last capture group is the rest of the line after the "="
    $result = trim( array_pop( $matches ) );
    return ( strlen( $result ) > 0 ) ? $result : false;
}
/**
 * Checks whether a category page exists on Wikimedia Commons.
 *
 * Sends a query for "Category:<name>" through the global $http client and
 * inspects the single page entry in the response.
 *
 * @param string $category Category name without the "Category:" prefix
 * @return bool True if the API reports the page as neither missing nor invalid;
 *              false otherwise, including when the response cannot be decoded
 */
function categoryExists( $category ){
    global $http;
    $json = $http->get( 'https://commons.wikimedia.org/w/api.php?format=json&action=query&titles=Category:' . urlencode( $category ) );
    $arr = is_string( $json ) ? json_decode( $json, true ) : null;
    // A failed request or an unexpected payload cannot confirm existence
    if( !is_array( $arr ) || !isset( $arr['query']['pages'] ) || !is_array( $arr['query']['pages'] ) ) return false;
    // Only one title was requested, so there is a single page entry
    $page = array_pop( $arr['query']['pages'] );
    if( !is_array( $page ) ) return false;
    return !isset( $page['missing'] ) && !isset( $page['invalid'] );
}
/**
 * Fetches the titles of the categories a Commons page belongs to.
 *
 * Queries the Commons API through the global $http client. The maximum
 * category limit is requested because the API otherwise returns only a small
 * default number of categories per page.
 *
 * @param string $page Full page title, e.g. "File:Example.jpg" or "Category:Example"
 * @return array Category titles as reported by the API (with their "Category:"
 *               prefix); empty if the page is missing, has no categories, or
 *               the response cannot be decoded
 */
function getCategoriesOfPage( $page ){
    global $http;
    // The title is URL-decoded before being encoded again, so an already
    // percent-encoded title is not encoded twice
    $json = $http->get( 'https://commons.wikimedia.org/w/api.php?format=json&action=query&prop=categories&cllimit=max&titles=' . urlencode( urldecode( $page ) ) );
    $arr = is_string( $json ) ? json_decode( $json, true ) : null;
    if( !is_array( $arr ) || !isset( $arr['query']['pages'] ) || !is_array( $arr['query']['pages'] ) ) return array();
    // Only one title was requested, so there is a single page entry
    $page = array_pop( $arr['query']['pages'] );
    if( !is_array( $page ) || isset( $page['missing'] ) ) return array();
    // A page without categories has no "categories" key at all
    if( !isset( $page['categories'] ) || !is_array( $page['categories'] ) ) return array();
    $categories = array();
    foreach( $page['categories'] as $pageCategory ){
        $categories[] = $pageCategory['title'];
    }
    return $categories;
}
/**
 * Tells whether a Commons page is a direct member of a given category.
 *
 * @param string $page Full page title, e.g. "File:Example.jpg"
 * @param string $category Category name without the "Category:" prefix
 * @return bool True if "Category:<category>" is among the page's categories
 */
function pageHasCategory( $page, $category ){
    $wanted = "Category:$category";
    foreach( getCategoriesOfPage( $page ) as $title ){
        if( $title == $wanted ) return true;
    }
    return false;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment