Instantly share code, notes, and snippets.

Embed
What would you like to do?
dewiki-category
#!/usr/bin/perl
#
# Extract all category names from the German Wikipedia database dump
# http://dumps.wikimedia.org/dewiki/latest/dewiki-latest-category.sql.gz
#
# gzip -dc dewiki-latest-category.sql.gz | ./parse.pl
#
use strict;
use warnings;

# Reads a MediaWiki `category` table dump (STDIN or the files named in @ARGV)
# and prints one category title per line to STDOUT. Hidden categories and
# titles carrying one of the namespace-like prefixes listed in
# extract_titles() are left out.

# One tuple of an INSERT statement:
#   (cat_id,'cat_title',cat_pages,cat_subcats,cat_files[,cat_hidden])
# The title is a single-quoted SQL string in which quotes and backslashes are
# backslash-escaped, so it may itself contain commas or parentheses; matching
# the quoted string as a unit keeps those from being mistaken for separators.
# The sixth column is optional so that tuples without it are still parsed;
# such rows are treated as not hidden.
my $ROW_RE = qr{
    \(
        (\d+) ,                          # capture 1: cat_id
        ' ( (?: [^'\\] | \\. )* ) ' ,    # capture 2: cat_title, still escaped
        (-?\d+) ,                        # capture 3: cat_pages
        (-?\d+) ,                        # capture 4: cat_subcats
        (-?\d+)                          # capture 5: cat_files
        (?: , (\d+) )?                   # capture 6: cat_hidden, optional
    \)
}xs;

# Backslash escapes that stand for a control character; every other escaped
# character (quote, double quote, backslash, ...) simply stands for itself.
my %UNESCAPE = (
    '0' => "\0",
    'b' => "\b",
    'n' => "\n",
    'r' => "\r",
    't' => "\t",
    'Z' => "\x1a",
);

# unescape_sql($text)
# Returns a copy of $text with SQL backslash escapes resolved.
sub unescape_sql {
    my ($text) = @_;
    $text =~ s/\\(.)/exists $UNESCAPE{$1} ? $UNESCAPE{$1} : $1/ges;
    return $text;
}

# extract_titles($line)
# Takes one line of the dump. Returns the list of category titles found in
# it, in input order; returns an empty list for any line that is not an
# "INSERT INTO `category` VALUES ..." statement.
sub extract_titles {
    my ($line) = @_;    # private copy: the prefix is stripped from it below
    return unless $line =~ s/^INSERT INTO `category` VALUES //;

    my @titles;

    # \G pins every match to the end of the previous one, so a malformed tuple
    # stops the scan instead of letting the pattern resynchronise somewhere
    # inside a later title. /c keeps pos() after the final failed attempt.
    while ( $line =~ /\G,?$ROW_RE/gc ) {

        # Copy the captures before any other regex runs and clobbers them.
        # Captures 3-5 (pages, subcats, files) are also available here, e.g.
        # to skip empty categories or to print the counts as extra columns.
        my ( $raw_title, $hidden ) = ( $2, $6 );

        next if $hidden;    # ignore hidden categories

        my $title = unescape_sql($raw_title);
        next if $title =~ /^(Benutzer|Portal|Wikipedia|Datei|Vorlage|Bild|Hilfe):/;

        push @titles, $title;
    }

    # Anything left besides the closing ";" means a tuple did not have the
    # expected shape; report it rather than dropping data silently. The
    # message has no trailing newline so Perl appends the input line number.
    my $rest = substr $line, pos($line) || 0;
    if ( $rest !~ /^;?\s*$/ ) {
        warn "unparsed data in INSERT statement starting at: "
            . substr( $rest, 0, 60 );
    }

    return @titles;
}

# main()
# Filters the dump line by line and prints each extracted title.
sub main {
    while ( my $line = <> ) {
        print "$_\n" for extract_titles($line);
    }
    return;
}

# Run as a script when executed directly; stay quiet when loaded via require.
main() unless caller;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment