Skip to content

Instantly share code, notes, and snippets.

Created January 3, 2013 19:51
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save anonymous/4446501 to your computer and use it in GitHub Desktop.
This is a script to build a quick vocabulary of the most frequent words in a language, based on spidering and scraping a list of URLs. It shows each word in context and links to a Google Translate page.
#!/usr/bin/perl
# DIY vocabulary builder: spiders a seed list of Dutch URLs, counts word
# frequencies, collects short example sentences, and emits an XHTML page
# linking each word and sentence to Google Translate.
$|=1;   # unbuffer STDOUT so printed HTML interleaves sanely with warn() progress
use strict;
use warnings;
use LWP::UserAgent;
use URI::URL;
use HTML::Strip;
use HTML::LinkExtor;
use Data::Dumper;
use List::Util qw( shuffle );
use Text::Sentence qw(split_sentences);
#
# Some global vars
#
my $MAXURLS = 100;      # stop after fetching this many documents
my $PAGEITEMS = 250;    # number of top-frequency words to put on the page
my @nexturls = qw(
http://nl.yahoo.com/?p=us
http://nl.wikipedia.org/wiki/Hoofdpagina
http://nl.wikipedia.org/wiki/Amsterdam
);
#--------------------------------------------------------------------------------
#
# Main program
#
#--------------------------------------------------------------------------------
#
# Main program fetching loop
#
#
# State shared with the subs below:
#   %frequencies - word => occurrence count across all fetched pages
#   %seen_urls   - set of URLs already queued (populated by get_urls)
#   %sentences   - word => arrayref of example sentences containing it
my %frequencies;
my %seen_urls;
my %sentences;
my $urlcounter = 1;

# One user agent for the whole crawl (the original constructed a fresh
# LWP::UserAgent for every request).
my $ua = LWP::UserAgent->new;

while ( my $url = shift @nexturls ) {
    warn "[DOC $urlcounter/$MAXURLS] $url\n";
    my $doc = $ua->get( $url );

    # Skip failed fetches: on an error response decoded_content can be
    # undef, which the scraping subs would die on.
    if ( $doc->is_success ) {
        sentence_ref  ( $doc, \%sentences   );
        word_frequency( $doc, \%frequencies );
        get_urls      ( $doc, \@nexturls    );
    }
    else {
        warn "  fetch failed: " . $doc->status_line . "\n";
    }
    last unless $urlcounter++ < $MAXURLS;
}
#
#
# Spew out results!
#
#
#
# Emit the XHTML report: header, then one <div> box per word ordered by
# descending frequency, each with up to six short example sentences.
#
my $lines = 1;
print <<EOH;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<link rel="stylesheet" href="dwords.css" type="text/css"/>
</head>
<body>
<P>
DIY vocabulary builder. This document contains the $PAGEITEMS most frequently used dutch words
found in $MAXURLS documents from a selection of dutch websites. You probably came here from
<a href="http://www.einarsen.no/">my weblog</a> or my twitter:
<a href="http://www.twitter.com/matseinarsen/">@matseinarsen</a>.
</P>
<DL>
EOH
# NOTE: the heredoc above already opened the <DL>; the original printed a
# second, duplicate "<DL>" here.
for my $key ( sort { $frequencies{ $b } <=> $frequencies{ $a } } keys %frequencies ) {
    my $uckey  = uc $key;
    my $ucfkey = ucfirst $key;
    print <<EOH;
<div class="box" id="$lines">
<DT><a class="title" target="translation" href="http://translate.google.com/#nl|en|$key\">$uckey - $ucfkey - $key </a>
<br/>
$frequencies{ $key } mentions in $MAXURLS documents - $lines most frequent word - <a href="javascript:document.getElementById('$lines').style.opacity = 0.1; ">hide</a> - <a href="javascript:document.getElementById('$lines').style.opacity = 1; ">show</a></DT>
EOH
    # Up to six of the shortest distinct example sentences, with the word
    # highlighted and each sentence linked to Google Translate.
    my %seen;
    print map  { my $h = $_;
                 $h =~ s/\b($key)\b/<b>$1<\/b>/gi;
                 "<DD><a target=\"translation\" href=\"http://translate.google.com/#nl|en|$_\">$h</a></DD>" }
          grep { defined $_ }                # the [0..5] slice pads with undef
          ( sort { length $a <=> length $b }
            grep { ! $seen{ $_ }++ }         # deduplicate repeated sentences
            @{ $sentences{$key} }
          )[0..5]
          ;
    print "</div>\n";
    # Was "> $PAGEITEMS", which emitted one item too many.
    last if $lines++ >= $PAGEITEMS;
}
print "</DL>\n";
# Close the document so the emitted XHTML is well-formed (the original
# never closed <body>/<html>).
print "</body>\n</html>\n";
#--------------------------------------------------------------------------------
#
# Subs
#
#--------------------------------------------------------------------------------
#
# Get sentence context
#
sub sentence_ref {
    # Extract clean example sentences from an LWP response and index them
    # by every word they contain.
    #
    # Args: $response  - object providing decoded_content (HTTP::Response)
    #       $sentences - hashref: word => arrayref of sentences
    my $text      = HTML::Strip->new->parse( shift->decoded_content );
    my $sentences = shift;

    # Drop newlines and footnote markers like "[12]".
    $text =~ s/(\n|\[\d+\])//g;

    # Keep only "clean" sentences: no embedded sentence boundaries, no
    # double spaces, no run-together words (lowercase directly followed
    # by uppercase) -- all signs of scraping artifacts.
    my @sentences = grep { !( /[\.\!\?](.*)[\.\?\!]/ or /\s\s/ or /[a-z][A-Z]/ ) }
                    split_sentences( $text );

SENTENCE:
    for my $sentence ( @sentences ) {
        # Remove whitespace before punctuation. The original pattern
        # "(\.\,\?\!)" matched the literal sequence ".,?!" rather than
        # the intended character class, so the cleanup never fired.
        $sentence =~ s/\s+([.,?!])/$1/g;

        # Keep sentences of readable length: 5..21 words (4..20 spaces).
        ( ($sentence =~ tr/ //) > 20 or ($sentence =~ tr/ //) < 4 )
            and next SENTENCE;

        WORD:
        for my $word ( split / /, lc $sentence ) {
            $word =~ s/[^\w]//g;            # strip punctuation from the token
            length $word or next WORD;
            # Push through the hashref argument. The original pushed into
            # the file-global %sentences, silently ignoring the parameter.
            push @{ $sentences->{$word} }, $sentence;
        }
    }
}
#
# Get the frequency of words in a LWP result object
#
#
# Count how often each word occurs in an LWP response's text, adding the
# counts into the supplied hashref (word => count). Tokens containing
# digits or non-word characters are ignored.
#
sub word_frequency {
    my $response    = shift;
    my $frequencies = shift;
    my $text = HTML::Strip->new->parse( $response->decoded_content );

    # Collapse whitespace and common punctuation into single spaces.
    $text =~ s/[\s\.\,\)\(\?]+/ /g;

    for my $token ( split / /, lc $text ) {
        next unless $token =~ /^[\w]+$/;   # word characters only
        next if     $token =~ /[0-9]/;     # skip anything with digits
        $frequencies->{ $token }++;
    }
}
#
# Push the urls in a LWP object into a list
#
#
# Extract links from an LWP response, filter to unseen Dutch-looking
# URLs, shuffle them, and push them onto the crawl queue. Everything
# queued is recorded in the file-global %seen_urls so it is never
# queued twice.
#
sub get_urls {
    my $document = shift;
    my $links    = shift;

    push @$links, shuffle
         grep {  /(nl\.|\.nl|Nederlands)/i            # Dutch-looking URLs only
             and !/(mediawiki|wikimedia|creativecom)/
             and !/\#/                                # no fragment links
             and !/(rss|css|js|gif|jpg|png)$/i        # skip feeds and assets
             and !exists $seen_urls{ $_ } }
         map  { url( $_->[2], $document->base )->abs }   # absolutize relative hrefs
         HTML::LinkExtor->new->parse( $document->content )->links;

    # (Removed a dead "@$links = @$links;" self-assignment that had no effect.)

    # Mark everything now queued as seen.
    @seen_urls{ @$links } = ();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment