Created
September 1, 2015 15:24
-
-
Save he7d3r/c7927e047d094d72d057 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
# Code : Dake
#
# Counts how often each word occurs in the text of every page of a MediaWiki
# dump and appends the resulting frequency table to a text file.
#
# Usage: pass the path of the dump file as the first command-line argument.
use strict;
use Parse::MediaWikiDump;
use utf8;    # the word pattern used further down contains UTF-8 literals

# Path of the dump to read; the script aborts when it is missing.
my $dump = shift(@ARGV) or die "Please specify a dump file";

# Page iterator over the dump.
my $pages = Parse::MediaWikiDump::Pages->new($dump);

my $page;    # page currently being processed
my $key;     # loop variable available to the report-building code

# Report text; the frequency lines and timestamps are appended to it later.
my $result = "\nWords by frequency on Wikipedia (all namespaces):\n";

# Lower-cased word => number of occurrences.
my %words = ();

my $t1 = localtime( );    # start time, reported as "Start:" in the output
my $t2;                   # end time, filled in once counting has finished

binmode STDOUT, ":utf8";
# A word is two or more characters long: it starts and ends with a letter and
# may contain letters, apostrophes and hyphens in between. The match is
# case-insensitive so that all-caps and mixed-case words are counted whole
# instead of being chopped into fragments.
# (An alternative pattern once tried here: / (a \w+[aeio]r)\W/gi)
my $word_re = qr{
    (
        [a-záàâãçéêíñóòôõúü]        # first letter
        [a-záàâãçéêíñóòôõúü'-]*     # interior letters, apostrophes, hyphens
        [a-záàâãçéêíñóòôõúü]        # last letter
    )
}xi;

# Walk every page of the dump and tally each word, folded to lower case.
while (defined($page = $pages->next)) {
    #main namespace only
    #next unless $page->namespace eq '';
    my $text = $page->text;    # used as a scalar reference below

    while ($$text =~ /$word_re/g) {
        $words{lc($1)}++;
    }
}
# Sort comparator: orders the keys of %words by descending count.
# Words with the same count are ordered alphabetically so that the report is
# identical from run to run instead of depending on hash ordering.
# Based on http://www.devdaily.com/perl/edu/qanda/plqa00016
sub hashValueDescendingNum {
    return $words{$b} <=> $words{$a}
        || $a cmp $b;
}
# Append one "count<TAB>word" line per distinct word, most frequent first.
$result .= join '',
    map { "$words{$_}\t$_\n" }
    sort hashValueDescendingNum keys %words;

# Record when counting finished and add both timestamps to the report.
$t2 = localtime( );
$result .= "Start:\t$t1\nEnd:\t$t2";

# Append the report to the output file. The explicit UTF-8 layer keeps the
# accented words from being written as Latin-1 bytes, and every I/O step is
# checked so that a failed write cannot go unnoticed.
my $out_file = 'wp-words-by-frequency-all-ns.txt';
open(my $out, '>>:encoding(UTF-8)', $out_file)
    or die "Cannot open $out_file for appending: $!";
print {$out} $result
    or die "Cannot write to $out_file: $!";
close($out)
    or die "Cannot close $out_file: $!";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment