Skip to content

Instantly share code, notes, and snippets.

@Jarry1250
Last active December 23, 2015 07:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Jarry1250/6601404 to your computer and use it in GitHub Desktop.
Save Jarry1250/6601404 to your computer and use it in GitHub Desktop.
Wikipedia bot to swap in address lines for name placeholders
<?php
/*
 * Bot script: for listed-building rows whose name is still a database
 * placeholder, swap the placeholder for a cleaned-up address line.
 *
 * Steps:
 *   1. Load uid => (placeholder name, cleaned address) pairs from MySQL.
 *   2. Collect candidate list pages from the wiki.
 *   3. For each page, rewrite matching template rows and save the page
 *      if at least one row actually changed.
 */
require_once( '../mw-peachy/Init.php' );
set_time_limit( 0 ); // remove PHP's execution time limit for this run

$site = Peachy::newWiki( 'livingbot' );
$template = 'EH listed building row';

$mysqli = new mysqli( "localhost", 'root' );
/* check connection */
if ( mysqli_connect_errno() ) {
	printf( "Connect failed: %s\n", mysqli_connect_error() );
	die();
}

// Build the lookup table of pending edits, keyed by uid.
$edits = array();
$mysqli->select_db( 'wlm' );
$result = $mysqli->query("SELECT `uid`,`name`,`Address` FROM `data-final` INNER JOIN `data-orig` ON Asset_number=uid WHERE `record_name` LIKE 'No name%'");
if ( $result === false ) {
	// query() returns false on failure; calling fetch_assoc() on it would be fatal
	printf( "Query failed: %s\n", $mysqli->error );
	die();
}
while ( $row = $result->fetch_assoc() ) {
	$edits[ $row['uid'] ] = array(
		'from' => $row['name'],
		'to' => cleanup( $row['Address'] )
	);
}

// Candidate pages: titles returned by allpages() that also embed the template.
// NOTE(review): presumably this selects main-namespace pages prefixed "Grade I",
// capped at 500 -- confirm against the Peachy allpages()/embeddedin() signatures.
$lists = array();
$pages = $site->allpages( 0, 'Grade I', null, 'all', null, null, array(), array(), 'ascending', 'all', 500 );
foreach ( $pages as $page ) {
	$lists[] = $page['title'];
}
$lists = array_intersect( $lists, $site->embeddedin( "Template:$template", 0, 500 ) );

foreach ( $lists as $list ) {
	echo "Trying $list... \n";
	$list = new Page( $site, $list );
	$content = $list->get_text();

	// Split on the template opener so each chunk holds (at most) one row;
	// the opener itself is restored by the implode() below.
	$sections = explode( '{{' . $template, $content );
	$changes = false;

	foreach ( $sections as &$section ) {
		$uid = extractParameterFromTemplate( 'uid', $section );
		if ( $uid === false || !isset( $edits[$uid] ) ) continue;

		$name = extractParameterFromTemplate( 'name', $section );
		$from = $edits[$uid]['from'];
		$to = $edits[$uid]['to'];
		if ( $name !== $from ) continue; // a human has tweaked this, don't overwrite

		// Work on a copy so a row is either fully updated or left untouched.
		$updated = $section;

		if ( preg_match( '/ [NSEWC][NSEWC]?[0-9]+$/', $to, $matches ) ) {
			// Move postcode: strip it from the end of the address only (the
			// pattern is end-anchored) and append it to the location parameter.
			$to = substr( $to, 0, -strlen( $matches[0] ) );
			// '${1}' rather than '$1' keeps the group reference unambiguous.
			$updated = preg_replace( "/(\| *location *= *[^|\n]+)/", '${1}' . $matches[0], $updated );
		}

		// The placeholder is data, not a pattern: quote it. The address usually
		// starts with a house number, so the group reference must be written as
		// '${1}' -- '$1' followed by digits would be read as a different group.
		// Backslashes and dollars in the address are escaped for the same reason.
		$pattern = '/(\| *name *= *)' . preg_quote( $from, '/' ) . '/';
		$replacement = '${1}' . str_replace( array( '\\', '$' ), array( '\\\\', '\\$' ), $to );
		$updated = preg_replace( $pattern, $replacement, $updated, -1, $count );

		// Skip rows where the swap failed or matched nothing, so no partial
		// change (postcode moved, name untouched) is ever saved.
		if ( $updated === null || $count === 0 ) continue;

		$section = $updated;
		$changes = true;
	}
	unset( $section ); // drop the dangling reference left by the by-reference loop

	if ( !$changes ) continue;

	// Reassemble the page from the (possibly modified) chunks.
	$content = implode( '{{' . $template, $sections );
	$list->edit( $content, 'Bot trial ([[Wikipedia:Bots/Requests_for_approval/LivingBot_24|details]]): swap in address lines for name placeholders' );
}
/**
 * Turn a raw address string into a display-ready address line.
 *
 * Commas are flattened to spaces, hyphens become en dashes, the result is
 * title-cased via titleCase(), and finally commas are re-inserted between
 * a leading run of two or three house numbers.
 *
 * @param string $address Raw address text.
 * @return string The tidied address.
 */
function cleanup( $address ){
	// Most commas are bad: a comma, with or without a following space,
	// ends up as exactly one space.
	$tidied = str_replace( array( ', ', ',' ), ' ', $address );
	$tidied = titleCase( str_replace( '-', '–', $tidied ) ); // hyphen to dash, then case

	// But some are good: restore separators between leading house numbers.
	// The three-number form is tried first; once it has inserted commas the
	// two-number form can no longer match.
	$patterns = array(
		'/^([0-9]+[a-z]?)( [0-9]+[a-z]?)( [0-9]+[a-z]? )/',
		'/^([0-9]+[a-z]?)( [0-9]+[a-z]? )/',
	);
	$replacements = array(
		"$1,$2,$3",
		"$1,$2",
	);
	return preg_replace( $patterns, $replacements, $tidied );
}
/**
 * Converts a string into title case, customised for addresses.
 *
 * Words are split on single spaces. Roman numerals from a fixed list and
 * postcode-district-like tokens (one or two of N/S/E/W/C followed by
 * digits) are upper-cased; "and" / "of" are lower-cased; every other word
 * gets an upper-case first letter and lower-case remainder. The first
 * character of the whole result is always upper-cased.
 *
 * All checks are case-insensitive, so "iv", "Iv" and "IV" all yield "IV".
 *
 * @param string $string Text to convert.
 * @return string The converted text (the input is returned as-is when empty).
 */
function titleCase( $string ) {
	if( strlen( $string ) == 0 ) return $string;

	// list of words we don't want to capitalize
	$smallWords = array(
		'and',
		'of'
	);
	// special words that are always written in upper case
	$specialWords = array(
		'II',
		'IV',
		'VI',
		'III',
		'VII'
	);

	// split the string on spaces into an array of words
	$allWords = explode( ' ', $string );
	foreach ( $allWords as &$word ) {
		$upper = strtoupper( $word );
		$lower = strtolower( $word );
		// Compare the upper-cased form so the numeral check is case-insensitive,
		// matching the /i flag on the postcode pattern.
		if ( in_array( $upper, $specialWords, true ) || preg_match( '/^[NSEWC][NSEWC]?[0-9]+$/i', $word ) ) {
			$word = $upper;
		} elseif( in_array( $lower, $smallWords, true ) ){
			$word = $lower;
		} else {
			$word = ucfirst( $lower );
		}
	}
	unset( $word ); // drop the dangling reference left by the by-reference loop

	// convert the array back to a string
	$joined = implode( ' ', $allWords );
	// A leading small word ("of ...") still starts with a capital.
	return strtoupper( $joined[0] ) . substr( $joined, 1 );
}
/**
 * Pull the value of a named parameter out of a chunk of template wikitext.
 *
 * Wiki links are flattened first ("[[Target|Label]]" becomes "Label",
 * "[[Target]]" becomes "Target"), so the returned value is plain text.
 * A value runs from the "=" up to the next pipe or newline and is trimmed.
 *
 * @param string|array $parameters One parameter name, or several names to
 *                                 try in order.
 * @param string       $template   Wikitext to search.
 * @return string|false The first non-empty value found, or false when no
 *                      listed parameter is present with a non-empty value.
 */
function extractParameterFromTemplate( $parameters, $template ){
	if( !is_array( $parameters ) ) $parameters = array( $parameters );

	// Trim links
	$template = preg_replace( '/\[\[([^]]+\|)?/', '', $template );
	$template = str_replace( ']]', '', $template );

	foreach( $parameters as $parameter ){
		// The name is literal text, so quote it: otherwise "." or "(" in a
		// name would act as regex syntax and could match another parameter.
		$pattern = '/\| *' . preg_quote( $parameter, '/' ) . " *= *(\[\[([^]]+\|)?)?([^|\n]*)/";
		if( !preg_match( $pattern, $template, $matches ) ) continue;

		// Group 3 is the value; it always participates in a successful match.
		$result = trim( $matches[3] );
		if ( strlen( $result ) > 0 ) return $result;
	}
	return false;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment