Skip to content

Instantly share code, notes, and snippets.

@toniher
Last active August 29, 2015 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save toniher/9568414 to your computer and use it in GitHub Desktop.
Save toniher/9568414 to your computer and use it in GitHub Desktop.
Script for the 2014 Wikipedia Catalan culture challenge (http://en.wikipedia.org/wiki/Wikipedia:Catalan_culture_challenge/list): it checks the page length of each listed article in English Wikipedia and its coverage in other Wikimedia sister projects.
#!/usr/bin/env perl -w
use MediaWiki::Bot;
use LWP::Simple qw(get);
use JSON qw(from_json);
use URI::Escape;
use Data::Dumper;
use utf8;
# Emit UTF-8 on STDOUT so that non-ASCII article titles print correctly.
binmode STDOUT, ":utf8";
my $user = 'xxx'; my $pass = 'xxx';
#Create a MediaWiki::Bot object
# Fail early with a readable message if the constructor hands back nothing
# usable, instead of crashing later on a method call against undef.
my $enreader = MediaWiki::Bot->new({
    host => 'en.wikipedia.org',
    login_data => { username => $user, password => $pass },
}) or die "Could not set up the MediaWiki::Bot client for en.wikipedia.org\n";
# Read wikisites correspondences
my $urlsites = "http://www.wikidata.org/w/api.php?action=sitematrix&format=json";
# LWP::Simple::get returns undef on failure; check it here so the error names
# the real problem rather than surfacing as a JSON parse failure.
my $sitematrix_json = get($urlsites);
die "Could not fetch the site matrix from $urlsites\n"
    unless defined $sitematrix_json;
# %sites maps a wiki database name to its base URL; the subs below read it.
my %sites = get_sites( from_json($sitematrix_json) );
#print Dumper(%sites);
# Gotta get the list
my $page = $enreader->get_text("Wikipedia:Catalan_culture_challenge/list");
die "Could not read Wikipedia:Catalan_culture_challenge/list\n"
    unless defined $page;
print "Article\tNumber iw\tList iw\n";
foreach my $line ( split( "\n", $page ) ) {
    # Only numbered-list lines (starting with '#') carry challenge articles.
    next unless $line =~ /^\#/;
    # Take the target of the FIRST wiki link on the line: stop at the first
    # ']]' and drop any '|label' part, so "[[A|B]] and [[C]]" yields "A".
    my ($entry) = $line =~ /\[\[\s*([^\]|]+?)\s*(?:\|[^\]]*)?\]\]/;
    #my $entry = "Joan Alcover";
    # A list line without any link has nothing to look up.
    next unless defined $entry;
    process_page($entry);
}
# Build a lookup table (database name => URL) from a decoded sitematrix
# response.
#
# Argument: a hash reference holding a "sitematrix" hash.
# Returns:  a flat key/value list suitable for assigning to a hash.
#
# Only entries of "sitematrix" that are hash references and carry a defined
# "site" list contribute; everything else is skipped.
sub get_sites {
    my ($object) = @_;
    my %url_for;
    for my $group ( values %{ $object->{"sitematrix"} } ) {
        next unless ref($group) eq 'HASH';
        next unless defined $group->{"site"};
        for my $wiki ( @{ $group->{"site"} } ) {
            $url_for{ $wiki->{"dbname"} } = $wiki->{"url"};
        }
    }
    return %url_for;
}
# Print one tab-separated report row for a single article:
#   resolved title, interwiki number, comma-separated "site:length" list.
#
# Argument: the article title as found in the challenge list.
# Side effects: two HTTP requests here (plus those made by process_iw_len),
# output on STDOUT, and a 3 second pause at the end.
#
# A failed or unparsable API response no longer aborts the whole run: a
# warning is issued and the row is printed with whatever could be determined.
sub process_page {
    my $entry = shift;

    # Nothing to look up for a missing or empty title.
    return unless defined($entry) && length($entry);

    # Ask English Wikipedia about the page, following redirects.
    my $url = "http://en.wikipedia.org/w/api.php?action=query&titles=".uri_escape_utf8($entry)."&prop=info&format=json&redirects";
    # get() returns undef on failure and from_json() croaks on bad input,
    # so guard both before using the result.
    my $content = get($url);
    my $jsonobj = defined($content) ? eval { from_json($content) } : undef;
    if ( ref($jsonobj) ne 'HASH' ) {
        warn "Could not fetch page info for '$entry'; using the title as given\n";
        $jsonobj = {};
    }
    my $name = get_redirect( $jsonobj, $entry );
    print $name, "\t";

    # Ask Wikidata for the sitelinks attached to the resolved title.
    my $wikidata_url = "http://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=".uri_escape_utf8($name)."&languages=en&format=json";
    my $wd_content = get($wikidata_url);
    my $wd_obj = defined($wd_content) ? eval { from_json($wd_content) } : undef;
    if ( ref($wd_obj) ne 'HASH' ) {
        warn "Could not fetch Wikidata sitelinks for '$name'\n";
        $wd_obj = {};
    }
    my %listiw = get_iw($wd_obj);
    my @listiw = keys %listiw;
    # NOTE(review): $#listiw is the last index, i.e. the number of sitelinks
    # minus one (and -1 when there are none) -- presumably so the English page
    # itself is not counted; confirm before changing.
    print $#listiw, "\t";
    #print &process_iw(@listiw), "\t";
    print process_iw_len( \%listiw ), "\n";
    # Pause between articles (presumably to go easy on the APIs).
    sleep(3);
}
# Resolve the title to use for an article.
#
# Arguments: the decoded query response (hash reference) and the title that
#            was asked for.
# Returns:   the "to" title of the first entry in query.redirects when the
#            response lists redirects, otherwise the title passed in.
sub get_redirect {
    my ( $object, $entry ) = @_;
    my $redirects = $object->{"query"}->{"redirects"};
    return $entry unless defined($redirects);
    return $redirects->[0]->{"to"};
}
# Return the sitelinks of the first entity in a decoded wbgetentities
# response.
#
# Argument: the decoded response (hash reference) with an "entities" hash.
# Returns:  a flat key/value list (sitelink key => sitelink record) suitable
#           for assigning to a hash; the empty list when there is no entity
#           or the entity carries no sitelinks hash.
#
# The sitelinks value is only dereferenced when it really is a hash, so an
# entity without sitelinks (or with a non-hash value there) gives an empty
# result instead of a warning or a fatal dereference error.
sub get_iw {
    my $object = shift;
    return unless ref($object) eq 'HASH';

    my $entities = $object->{"entities"};
    return unless ref($entities) eq 'HASH';

    # Only the first entity is used, as before.
    my ($entity) = values %{$entities};
    return unless ref($entity) eq 'HASH';

    my $sitelinks = $entity->{"sitelinks"};
    return ref($sitelinks) eq 'HASH' ? %{$sitelinks} : ();
}
# Turn a list of sitelink keys into a printable, comma-separated string.
#
# Arguments: sitelink keys (e.g. database names).
# Returns:   the keys that have a defined entry in %sites, sorted, each with
#            every occurrence of "wiki" removed, joined with ", ".
sub process_iw {
    my @known = grep { defined( $sites{$_} ) } sort @_;
    s/wiki//g for @known;
    return join( ", ", @known );
}
# Produce the comma-separated list of page lengths for a set of sitelinks.
#
# Argument: hash reference mapping sitelink key => record holding "site"
#           and "title".
# Returns:  the get_length() result for every record, visited in sorted key
#           order, joined with ", ".
sub process_iw_len {
    my ($iwhash) = @_;
    my @lengths = map {
        my $link = $iwhash->{$_};
        # One value per sitelink, whatever get_length() returns.
        scalar( get_length( $link->{"site"}, $link->{"title"} ) );
    } sort keys %{$iwhash};
    return join( ", ", @lengths );
}
# Length of page
#
# Arguments: a site key (looked up in %sites to find the wiki's base URL)
#            and the page title on that wiki.
# Returns:   the string "<site>:<length>", where <length> is
#              - the "length" reported for the first page in the response,
#              - 0 when the response's page table has a "-1" entry (how this
#                script has always recognised a page the API did not find),
#              - -1 when the site is unknown, the request fails, the response
#                cannot be parsed, or no length is reported.
#
# get() returns undef on failure and from_json() croaks on bad input; both
# are guarded so a single unreachable wiki yields "-1" rather than aborting
# the whole report.
sub get_length {
    my $site  = shift;
    my $entry = shift;

    my $unknown = $site . ":" . "-1";

    return $unknown unless defined( $sites{$site} );

    my $url = $sites{$site}."/w/api.php?action=query&titles=".uri_escape_utf8($entry)."&prop=info&format=json&redirects";
    my $content = get($url);
    return $unknown unless defined($content);

    my $object = eval { from_json($content) };
    return $unknown unless ref($object) eq 'HASH';

    my $query = $object->{"query"};
    my $pages = ref($query) eq 'HASH' ? $query->{"pages"} : undef;
    return $unknown unless ref($pages) eq 'HASH';

    return $site . ":" . "0" if $pages->{"-1"};

    # Report the first page in the table.
    foreach my $page ( keys %{$pages} ) {
        my $info = $pages->{$page};
        my $length = ref($info) eq 'HASH' ? $info->{"length"} : undef;
        return defined($length) ? $site . ":" . $length : $unknown;
    }

    # Empty page table: nothing to measure.
    return $unknown;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment