Skip to content

Instantly share code, notes, and snippets.

@gardejo
Created March 12, 2010 14:24
Show Gist options
  • Save gardejo/330352 to your computer and use it in GitHub Desktop.
Save gardejo/330352 to your computer and use it in GitHub Desktop.
A code generator for Lingua::EO::StopWords
#! /usr/bin/perl
use strict;
use warnings;
use utf8;
use URI;
use Web::Scraper;
use YAML::Syck;
# Page that is scraped for the word lists.
my $source_uri = URI->new('http://wikisource.org/wiki/Baza_Radikaro_Oficiala');

# Hand the element with id="bodyContent" to taxonomize() and store whatever
# it returns under the 'taxonomy' key of the scrape result.
my $page_scraper = scraper {
    process(
        '//*[@id="bodyContent"]',
        taxonomy => \&taxonomize,
    );
};

# Fetch the page and run the scraper over it.
my $taxonomy = $page_scraper->scrape($source_uri);

# ...

# TODO: generate Lingua::EO::StopWords, which should provide stop_words(),
# is_stop_word(), prepositions(), is_preposition(), and so on.

exit;
# Build a nested word taxonomy from the direct children of the given element.
#
# The children of $tree are visited in document order:
#   * an <h2> sets the current group;
#   * an <h3> sets the current part of speech (an <h3> for which
#     look_down('id', 'siteSub') finds something is ignored);
#   * a <p> contributes its whitespace-separated words to the current
#     group / part of speech, after every apostrophe has been turned into a
#     slash.  A paragraph whose whole text is wrapped in parentheses is
#     skipped, as is any paragraph seen before both a group and a part of
#     speech are known.
#
# Parameters:
#   $tree - object providing content_list(); each element child must
#           provide tag(), look_down() and as_trimmed_text().
#
# Returns:
#   A hash reference shaped { group => { part_of_speech => [ words ] } },
#   or undef when no paragraph contributed anything.
#
# Dies:
#   When an <h2> or <h3> does not yield exactly one match for
#   class="mw-headline".
sub taxonomize {
    my $tree = shift;

    my ($taxonomy, $group, $part_of_speech);
    my @header_query = ('class', 'mw-headline');

    CHILD:
    foreach my $child ( $tree->content_list ) {
        # content_list() returns text nodes as plain strings next to the
        # element objects; calling ->tag on a string would die.
        next CHILD
            if !ref $child;

        my $element = $child->tag;

        if ($element eq 'h2') {
            my @headings = $child->look_down(@header_query);
            die 'h2 element does not have just one description'
                if scalar @headings != 1;
            $group = $headings[0]->as_trimmed_text;
        }
        elsif ($element eq 'h3') {
            next CHILD
                if $child->look_down('id', 'siteSub');
            my @headings = $child->look_down(@header_query);
            die 'h3 element does not have just one description'
                if scalar @headings != 1;
            # Strip leading quotes/digits/whitespace and cut the heading at
            # the first closing quote, " <punctuation>", or colon.
            ( $part_of_speech = $headings[0]->as_trimmed_text ) =~ s{
                \A
                ["\d\s]*
                ( .+? )
                (?: " .* | \s\W .* | : .* | \z)
            }{$1}xms;
        }
        elsif ($element eq 'p') {
            my $text = $child->as_trimmed_text;

            # Entirely parenthesised paragraphs are remarks, not word lists.
            next CHILD
                if $text =~ m{ \A \( .+ \) \z }xms;

            # Without a heading context there is nowhere sensible to file
            # the words (and undef hash keys would only raise warnings).
            next CHILD
                if !defined $group || !defined $part_of_speech;

            ( my $words = $text ) =~ tr{'}{/};

            # split ' ' collapses runs of whitespace, so adjacent separators
            # never produce empty "words".
            push @{ $taxonomy->{$group}{$part_of_speech} },
                split ' ', $words;
        }
    }

    return $taxonomy;
}
__END__
=pod

=head1 NAME

generator - A code generator for Lingua::EO::StopWords

=head1 SYNOPSIS

    % generator.pl

=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment