Skip to content

Instantly share code, notes, and snippets.

Created January 3, 2013 19:51
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save anonymous/4446501 to your computer and use it in GitHub Desktop.
This is a script to build a quick vocabulary of the most frequent words in a language, based on spidering and scraping a list of URLs. It shows each word in context and links to a Google Translate page.
#!/usr/bin/perl
# DIY vocabulary builder: spiders a seed list of Dutch URLs, counts word
# frequencies, collects short example sentences, and emits an XHTML page
# linking each word and sentence to Google Translate.
$|=1;   # unbuffer STDOUT so printed HTML interleaves sanely with warn() progress
use strict;
use warnings;
use LWP::UserAgent;
use URI::URL;
use HTML::Strip;
use HTML::LinkExtor;
use Data::Dumper;
use List::Util qw( shuffle );
use Text::Sentence qw(split_sentences);
#
# Some global vars
#
my $MAXURLS = 100;      # stop after fetching this many documents
my $PAGEITEMS = 250;    # number of top-frequency words to put on the page
my @nexturls = qw(
http://nl.yahoo.com/?p=us
http://nl.wikipedia.org/wiki/Hoofdpagina
http://nl.wikipedia.org/wiki/Amsterdam
);
#--------------------------------------------------------------------------------
#
# Main program
#
#--------------------------------------------------------------------------------
#
# Main program fetching loop
#
#
# State shared with the subs below:
#   %frequencies - word => occurrence count across all fetched pages
#   %seen_urls   - set of URLs already queued (populated by get_urls)
#   %sentences   - word => arrayref of example sentences containing it
my %frequencies;
my %seen_urls;
my %sentences;
my $urlcounter = 1;

# One user agent for the whole crawl (the original constructed a fresh
# LWP::UserAgent for every request).
my $ua = LWP::UserAgent->new;

while ( my $url = shift @nexturls ) {
    warn "[DOC $urlcounter/$MAXURLS] $url\n";
    my $doc = $ua->get( $url );

    # Skip failed fetches: on an error response decoded_content can be
    # undef, which the scraping subs would die on.
    if ( $doc->is_success ) {
        sentence_ref  ( $doc, \%sentences   );
        word_frequency( $doc, \%frequencies );
        get_urls      ( $doc, \@nexturls    );
    }
    else {
        warn "  fetch failed: " . $doc->status_line . "\n";
    }
    last unless $urlcounter++ < $MAXURLS;
}
#
#
# Spew out results!
#
#
#
# Emit the XHTML report: header, then one <div> box per word ordered by
# descending frequency, each with up to six short example sentences.
#
my $lines = 1;
print <<EOH;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<link rel="stylesheet" href="dwords.css" type="text/css"/>
</head>
<body>
<P>
DIY vocabulary builder. This document contains the $PAGEITEMS most frequently used dutch words
found in $MAXURLS documents from a selection of dutch websites. You probably came here from
<a href="http://www.einarsen.no/">my weblog</a> or my twitter:
<a href="http://www.twitter.com/matseinarsen/">@matseinarsen</a>.
</P>
<DL>
EOH
# NOTE: the heredoc above already opened the <DL>; the original printed a
# second, duplicate "<DL>" here.
for my $key ( sort { $frequencies{ $b } <=> $frequencies{ $a } } keys %frequencies ) {
    my $uckey  = uc $key;
    my $ucfkey = ucfirst $key;
    print <<EOH;
<div class="box" id="$lines">
<DT><a class="title" target="translation" href="http://translate.google.com/#nl|en|$key\">$uckey - $ucfkey - $key </a>
<br/>
$frequencies{ $key } mentions in $MAXURLS documents - $lines most frequent word - <a href="javascript:document.getElementById('$lines').style.opacity = 0.1; ">hide</a> - <a href="javascript:document.getElementById('$lines').style.opacity = 1; ">show</a></DT>
EOH
    # Up to six of the shortest distinct example sentences, with the word
    # highlighted and each sentence linked to Google Translate.
    my %seen;
    print map  { my $h = $_;
                 $h =~ s/\b($key)\b/<b>$1<\/b>/gi;
                 "<DD><a target=\"translation\" href=\"http://translate.google.com/#nl|en|$_\">$h</a></DD>" }
          grep { defined $_ }                # the [0..5] slice pads with undef
          ( sort { length $a <=> length $b }
            grep { ! $seen{ $_ }++ }         # deduplicate repeated sentences
            @{ $sentences{$key} }
          )[0..5]
          ;
    print "</div>\n";
    # Was "> $PAGEITEMS", which emitted one item too many.
    last if $lines++ >= $PAGEITEMS;
}
print "</DL>\n";
# Close the document so the emitted XHTML is well-formed (the original
# never closed <body>/<html>).
print "</body>\n</html>\n";
#--------------------------------------------------------------------------------
#
# Subs
#
#--------------------------------------------------------------------------------
#
# Get sentence context
#
sub sentence_ref {
    # Extract clean example sentences from an LWP response and index them
    # by every word they contain.
    #
    # Args: $response  - object providing decoded_content (HTTP::Response)
    #       $sentences - hashref: word => arrayref of sentences
    my $text      = HTML::Strip->new->parse( shift->decoded_content );
    my $sentences = shift;

    # Drop newlines and footnote markers like "[12]".
    $text =~ s/(\n|\[\d+\])//g;

    # Keep only "clean" sentences: no embedded sentence boundaries, no
    # double spaces, no run-together words (lowercase directly followed
    # by uppercase) -- all signs of scraping artifacts.
    my @sentences = grep { !( /[\.\!\?](.*)[\.\?\!]/ or /\s\s/ or /[a-z][A-Z]/ ) }
                    split_sentences( $text );

SENTENCE:
    for my $sentence ( @sentences ) {
        # Remove whitespace before punctuation. The original pattern
        # "(\.\,\?\!)" matched the literal sequence ".,?!" rather than
        # the intended character class, so the cleanup never fired.
        $sentence =~ s/\s+([.,?!])/$1/g;

        # Keep sentences of readable length: 5..21 words (4..20 spaces).
        ( ($sentence =~ tr/ //) > 20 or ($sentence =~ tr/ //) < 4 )
            and next SENTENCE;

        WORD:
        for my $word ( split / /, lc $sentence ) {
            $word =~ s/[^\w]//g;            # strip punctuation from the token
            length $word or next WORD;
            # Push through the hashref argument. The original pushed into
            # the file-global %sentences, silently ignoring the parameter.
            push @{ $sentences->{$word} }, $sentence;
        }
    }
}
#
# Get the frequency of words in a LWP result object
#
#
# Count how often each word occurs in an LWP response's text, adding the
# counts into the supplied hashref (word => count). Tokens containing
# digits or non-word characters are ignored.
#
sub word_frequency {
    my $response    = shift;
    my $frequencies = shift;
    my $text = HTML::Strip->new->parse( $response->decoded_content );

    # Collapse whitespace and common punctuation into single spaces.
    $text =~ s/[\s\.\,\)\(\?]+/ /g;

    for my $token ( split / /, lc $text ) {
        next unless $token =~ /^[\w]+$/;   # word characters only
        next if     $token =~ /[0-9]/;     # skip anything with digits
        $frequencies->{ $token }++;
    }
}
#
# Push the urls in a LWP object into a list
#
#
# Extract links from an LWP response, filter to unseen Dutch-looking
# URLs, shuffle them, and push them onto the crawl queue. Everything
# queued is recorded in the file-global %seen_urls so it is never
# queued twice.
#
sub get_urls {
    my $document = shift;
    my $links    = shift;

    push @$links, shuffle
         grep {  /(nl\.|\.nl|Nederlands)/i            # Dutch-looking URLs only
             and !/(mediawiki|wikimedia|creativecom)/
             and !/\#/                                # no fragment links
             and !/(rss|css|js|gif|jpg|png)$/i        # skip feeds and assets
             and !exists $seen_urls{ $_ } }
         map  { url( $_->[2], $document->base )->abs }   # absolutize relative hrefs
         HTML::LinkExtor->new->parse( $document->content )->links;

    # (Removed a dead "@$links = @$links;" self-assignment that had no effect.)

    # Mark everything now queued as seen.
    @seen_urls{ @$links } = ();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment