Skip to content

Instantly share code, notes, and snippets.

@toritori0318
Created October 10, 2010 16:53
Show Gist options
  • Save toritori0318/619373 to your computer and use it in GitHub Desktop.
Save toritori0318/619373 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use strict;
use WWW::Mechanize;
use Data::Dumper;
#use HTML::ContentExtractor;
use HTML::Parser;
use Lingua::EN::Tagger;
use 5.10.0;
# Crawl configuration and shared state used by getcontent() and summary().

# Page the crawl starts from.
my $baseurl = 'http://search.cpan.org/~shay/perl/pod/perl.pod';

# One browser object is shared by the whole crawl; the start page is fetched
# right away so that getcontent() has a page to work on.
my $mech = WWW::Mechanize->new();
$mech->get($baseurl);

my $linknum     = 2;    # per-branch counter handed to getcontent()
my $max_linknum = 2;    # shared counter, decremented once per processed page
my $interval    = 3;    # seconds slept at the start of every getcontent() call

# URLs that have already been requested.  This must be initialised with an
# empty list: assigning {} would store a stringified hash reference as a
# bogus key instead of producing an empty hash.
my %followed_link = ();

# Running totals of the values returned by Lingua::EN::Tagger's get_words(),
# summed per key over every processed page.
my %all_word_list = ();
# getcontent($mech, $linknum)
#
# Processes the page currently loaded in $mech, then recursively visits the
# links found on it.
#
# Parameters:
#   $mech    - WWW::Mechanize object with a page already loaded.
#   $linknum - counter for the current branch; decremented locally before
#              deciding whether to follow links and passed on to the
#              recursive calls.
#
# Side effects:
#   - sleeps $interval seconds on every call;
#   - adds the values returned by Lingua::EN::Tagger's get_words() to
#     %all_word_list;
#   - decrements the file-level $max_linknum once per processed page;
#   - records every requested URL in %followed_link;
#   - prints a "-- GET <url>" line per request and an error line when a
#     request throws.
#
# Returns nothing useful.
sub getcontent {
    my ($mech, $linknum) = @_;

    sleep $interval;
    return unless $mech->success() && $mech->is_html();

    # Gather the decoded text segments of the page.  Defined-or is used so
    # that a segment consisting of "0" is kept; only undef becomes ''.
    my $text   = '';
    my $parser = HTML::Parser->new(
        api_version => 3,
        text_h      => [ sub { $text .= shift // '' }, 'dtext' ],
    );
    $parser->parse($mech->content());
    $parser->eof();    # flush any text the parser is still buffering

    my $tagger    = Lingua::EN::Tagger->new;
    my %word_list = $tagger->get_words($text);
    for my $word (keys %word_list) {
        $all_word_list{$word} += $word_list{$word};
    }

    # Links are followed as long as either counter is still positive, i.e.
    # descent stops only once both are exhausted.
    $linknum--;
    $max_linknum--;
    return unless $linknum > 0 || $max_linknum > 0;

    my @links = $mech->find_all_links(
        text_regex    => qr/^perl/,
        url_abs_regex => qr#\.pod#,
    );

    LINK:
    for my $link (@links) {
        my $url = $link->url_abs();

        # Skip just this link when it was requested before; the remaining
        # links on the page must still be considered.
        next LINK if $followed_link{$url};
        $followed_link{$url} = 1;

        say "-- GET " . $url;

        # The block returns 1 on success, so a false result means get()
        # threw an exception.
        my $fetched = eval { $mech->get($url); 1 };
        if (!$fetched) {
            say "url->get error!";
            next LINK;
        }

        getcontent($mech, $linknum);
        $mech->back();
    }

    return;
}
# summary()
#
# Prints the entries of %all_word_list in descending order of their value,
# one "<word> : <value>" line each, stopping after 100 printed lines.
#
# Before printing, each word has the characters @ _ $ ( ) { } [ ] ! . %
# removed and surrounding whitespace trimmed.  Words that are empty after
# this clean-up are skipped and do not count towards the limit.
#
# Takes no arguments and returns nothing useful.
sub summary {
    my $limit = 100;

    # '@' is escaped so it is matched literally rather than being treated as
    # the start of the @_ array inside the pattern.
    my $reg_exclude = qr/[\@_\$\(\)\{\}\[\]\!\.%]/;

    my @by_count = sort { $all_word_list{$b} <=> $all_word_list{$a} }
        keys %all_word_list;

    my $printed = 0;
    for my $key (@by_count) {
        # Clean a copy: the untouched $key is still needed to look up the
        # value in %all_word_list.
        (my $label = $key) =~ s/$reg_exclude//g;
        $label =~ s/^\s+//;
        $label =~ s/\s+$//;
        next unless length $label;

        say $label . " : " . $all_word_list{$key};
        last if ++$printed >= $limit;
    }

    return;
}
# Entry point: process the page already loaded in $mech (following links from
# there), print the summary, then terminate the script.
getcontent( $mech, $linknum );
summary();
exit();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment