Last active
December 22, 2015 17:49
-
-
Save Jarry1250/6508660 to your computer and use it in GitHub Desktop.
Code for adding Commons categories to the English Wikipedia listed building pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Adds Commons category links ("commonscat") to the rows of English Wikipedia
 * listed-building lists, using the Peachy bot framework.
 */
require_once( '../mw-peachy/Init.php' );

// The run makes many API calls per list, so lift PHP's execution time limit.
set_time_limit( 0 );

// Wiki handle built from the 'livingbot' Peachy configuration; used to read and edit the lists.
$site = Peachy::newWiki( 'livingbot' );
// Wiki handle pointed at the Commons API; used for category prefix searches.
$commons = Peachy::newWiki( null, null, null, 'https://commons.wikimedia.org/w/api.php' );
// HTTP client used by the helper functions for direct Commons API queries.
$http = new HTTP();

// Opening text of the row template; list pages are split on this marker.
$template = '{{EH listed building row';
// Titles of the list pages to process.
$lists = array( 'Grade I listed buildings in Suffolk Coastal' /* etc */ );
// Values of a row's "type" parameter that mark it as a church. Defined ahead
// of the loops because both the image-based lookup and the name-based lookup
// below consult it.
$churches = array( 'Church', 'Parish Church' );

foreach( $lists as $listTitle ){
    echo "Editing $listTitle... ";
    file_put_contents( 'log.txt', "\n$listTitle\n--------------\n", FILE_APPEND );

    $list = new Page( $site, $listTitle );
    $content = $list->get_text();

    // Splitting on the row template's opening text leaves the page preamble in
    // $sections[0] and one chunk per row after it.
    $sections = explode( $template, $content );
    if( count( $sections ) < 2 ) continue;

    $area = extractParameterFromTemplate( 'subdivision_area', $sections[0] );
    if( !$area ) continue;
    $quotedArea = preg_quote( $area, '/' );

    foreach( $sections as &$section ){
        // Rows that already carry a non-empty commonscat are left alone.
        if( extractParameterFromTemplate( 'commonscat', $section ) !== false ) continue;

        $cleanName = $name = extractParameterFromTemplate( 'name', $section );
        if( $name === false ) continue;

        $image = extractParameterFromTemplate( 'image', $section );
        $type = extractParameterFromTemplate( 'type', $section );
        $isChurch = in_array( $type, $churches, true );

        // Only the first comma-separated component of the location is used.
        $location = extractParameterFromTemplate( 'location', $section );
        $locationParts = explode( ',', (string)$location );
        $location = trim( $locationParts[0] );
        $hasLocation = ( $location !== '' );
        $disambiguatedLocation = "$location, $area";

        // Wikitext values are quoted before being embedded in a pattern so that
        // characters such as "(" or "." in a place name are matched literally.
        $quotedLocation = preg_quote( $location, '/' );
        $locationPattern = '/' . $quotedLocation . '(,[^)]+)?\)?$/';
        $churchPattern = '/' . $quotedLocation . '(, ' . $quotedArea . ')?\)?$/';

        $newCommonsCat = false;
        $existingImageExemption = false;

        if( $image !== false ){
            // Try to recover one from an image
            foreach( getCategoriesOfPage( "File:$image" ) as $category ){
                $exactMatch = ( $category === "Category:$name" || $category === $name );
                // The location-based tests are skipped when there is no location:
                // an empty location would make the patterns match any category.
                $nameAndPlaceMatch = $hasLocation
                    && stripos( $category, $cleanName ) !== false
                    && preg_match( $locationPattern, $category );
                $churchAndPlaceMatch = $hasLocation
                    && $isChurch
                    && stripos( $category, 'Church' ) !== false
                    && preg_match( $churchPattern, $category );

                if( $exactMatch || $nameAndPlaceMatch || $churchAndPlaceMatch ){
                    $newCommonsCat = str_replace( 'Category:', '', $category );
                    // The category came from the image itself, so the later
                    // "image has category" sanity check is redundant.
                    $existingImageExemption = true;
                    break;
                }
            }
        }

        // Every remaining strategy relies on the location to disambiguate.
        if( $newCommonsCat === false && !$hasLocation ) continue;

        if( $newCommonsCat === false && $isChurch ){
            // Churches are particularly formulaic in their category structure
            // -- and well covered on Commons -- so we can try to be a bit more intelligent
            $cleanName = preg_replace( '/^(Parish )?Church of /', '', $cleanName );
            $cleanName = str_replace( array( 'St. ', 'Saint ' ), 'St ', $cleanName );
            if( !preg_match( '/^(St|All|the) /', $cleanName ) ) continue;
            $cleanName = preg_replace( '/^the /', 'Church of the ', $cleanName );

            $alternatives = array(
                "{$cleanName}, {$disambiguatedLocation}",
                "{$cleanName} ({$disambiguatedLocation})",
                "{$cleanName}, {$location}",
                "{$cleanName}' Church, {$location}",
                "{$cleanName}'s Church, {$disambiguatedLocation}",
                "{$cleanName}'s Church, {$location}"
            );
            foreach( $alternatives as $alternative ){
                if( categoryExists( $alternative ) ){
                    $newCommonsCat = $alternative;
                    break;
                }
            }
        }

        if( $newCommonsCat === false ){
            // Brute force: prefix-search the Commons category namespace (14)
            // and accept the first title that ends with the location.
            $alternatives = array_unique( array( $cleanName, str_replace( 'St ', 'Saint ', $cleanName ) ) );
            foreach( $alternatives as $alternative ){
                $possible = $commons->allpages( 14, $alternative );
                foreach( $possible as $category ){
                    $category = $category['title'];
                    if( preg_match( $locationPattern, $category ) ){
                        $newCommonsCat = str_replace( 'Category:', '', $category );
                        // Leave both loops so a later alternative cannot
                        // overwrite the first match.
                        break 2;
                    }
                }
            }
        }

        if( $newCommonsCat === false ) continue;

        if( $image === false ){
            file_put_contents( 'log.txt', "Interesting, $name now has Category:$newCommonsCat but no image...\n", FILE_APPEND );
        } elseif( !$existingImageExemption && !pageHasCategory( "File:$image", $newCommonsCat ) ){
            // Sanity check: existing image should have category
            file_put_contents( 'log.txt', "Alas File:$image does not have proposed Category:$newCommonsCat\n", FILE_APPEND );
            continue;
        }

        // Second sanity check: one of the category's parent categories must
        // mention the list's area.
        $mentionsArea = false;
        foreach( getCategoriesOfPage( "Category:$newCommonsCat" ) as $category ){
            if( stripos( $category, $area ) !== false ){
                $mentionsArea = true;
                break;
            }
        }
        if( !$mentionsArea ){
            file_put_contents( 'log.txt', "Alas Category:$newCommonsCat does not mention area $area\n", FILE_APPEND );
            continue;
        }

        echo "Adding $newCommonsCat to $name<br>";

        // Escape "\" and "$" so the category name is inserted literally rather
        // than being read as a back-reference in the replacement string.
        $safeCat = strtr( $newCommonsCat, array( '\\' => '\\\\', '$' => '\\$' ) );
        if( strpos( $section, 'commonscat' ) !== false ){
            // An empty commonscat parameter is already present: fill it in.
            $section = preg_replace( '/(commonscat *=)/', '$1 ' . $safeCat, $section, 1 );
        } else {
            // Otherwise add a commonscat line after the image line, copying
            // that line's spacing around the parameter name.
            $section = preg_replace( '/(( *)image( *)=.*)/', '$1' . "\n" . '|$2commonscat$3= ' . $safeCat, $section, 1 );
        }
    }
    // Drop the reference left behind by the by-reference foreach.
    unset( $section );

    $newContent = implode( $template, $sections );
    // Only save when a row was actually changed.
    if( $newContent !== $content ){
        $list->edit( $newContent, 'Try adding commonscats using a script-assisted method', false, false );
    }
}
/**
 * Pull the value of a named parameter out of a chunk of template wikitext.
 *
 * Wikilink markup is stripped before matching, so "[[Target|Label]]" is read
 * as "Label" and "[[Target]]" as "Target". The value runs from the "=" to the
 * end of that line and is trimmed.
 *
 * @param string $parameter Parameter name; matched literally
 * @param string $template  Wikitext to search
 * @return string|false The trimmed value, or false when the parameter is absent or empty
 */
function extractParameterFromTemplate( $parameter, $template ){
    // Trim links
    $template = preg_replace( '/\[\[([^]]+\|)?/', '', $template );
    $template = str_replace( ']]', '', $template );

    // Quote the name so characters such as "." or "(" cannot act as regex syntax.
    $pattern = '/\| *' . preg_quote( $parameter, '/' ) . ' *= *(.*)/';
    if( !preg_match( $pattern, $template, $matches ) ) return false;

    $result = trim( $matches[1] );
    return ( strlen( $result ) > 0 ) ? $result : false;
}
/**
 * Check whether a category page exists on Wikimedia Commons.
 *
 * Uses the global $http client to query the Commons API directly.
 *
 * @param string $category Category title without the "Category:" prefix
 * @return bool True only when the API reports a page that is neither missing
 *              nor invalid; false otherwise, including when the request fails
 *              or the response cannot be decoded
 */
function categoryExists( $category ){
    global $http;
    $json = $http->get( 'https://commons.wikimedia.org/w/api.php?format=json&action=query&titles=Category:' . urlencode( $category ) );

    // A failed request or malformed body must not be read as "exists".
    $arr = is_string( $json ) ? json_decode( $json, true ) : null;
    if( !is_array( $arr ) || !isset( $arr['query']['pages'] ) || !is_array( $arr['query']['pages'] ) ){
        return false;
    }

    // One title was requested, so the single entry in "pages" is the answer.
    $page = array_pop( $arr['query']['pages'] );
    return is_array( $page ) && !isset( $page['missing'] ) && !isset( $page['invalid'] );
}
/**
 * List the categories of a page on Wikimedia Commons.
 *
 * Uses the global $http client to query the Commons API directly.
 *
 * @param string $page Full page title, e.g. "File:Foo.jpg" or "Category:Bar";
 *                     it is URL-decoded first, so an already-encoded title is
 *                     not encoded twice
 * @return array Category titles as returned by the API (each including the
 *               "Category:" prefix); empty when the page is missing, has no
 *               categories, or the response cannot be read
 */
function getCategoriesOfPage( $page ){
    global $http;
    // cllimit=max: the API's default limit would otherwise truncate the
    // category list of heavily categorised pages.
    $json = $http->get( 'https://commons.wikimedia.org/w/api.php?format=json&action=query&prop=categories&cllimit=max&titles=' . urlencode( urldecode( $page ) ) );

    $arr = is_string( $json ) ? json_decode( $json, true ) : null;
    if( !is_array( $arr ) || !isset( $arr['query']['pages'] ) || !is_array( $arr['query']['pages'] ) ){
        return array();
    }

    // One title was requested, so the single entry in "pages" is the answer.
    $info = array_pop( $arr['query']['pages'] );
    if( !is_array( $info ) || isset( $info['missing'] ) ) return array();
    // The "categories" key is absent when the page has no categories.
    if( !isset( $info['categories'] ) || !is_array( $info['categories'] ) ) return array();

    $categories = array();
    foreach( $info['categories'] as $pageCategory ){
        $categories[] = $pageCategory['title'];
    }
    return $categories;
}
/**
 * Check whether a Commons page is directly in a given category.
 *
 * @param string $page     Full page title, e.g. "File:Foo.jpg"
 * @param string $category Category title without the "Category:" prefix
 * @return bool True when "Category:$category" is among the titles returned by
 *              getCategoriesOfPage() for the page
 */
function pageHasCategory( $page, $category ){
    $categories = getCategoriesOfPage( $page );
    // Strict comparison: titles are strings and must match exactly.
    return in_array( "Category:$category", $categories, true );
}
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.