Skip to content

Instantly share code, notes, and snippets.

@fuba
Created December 26, 2008 09:14
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fuba/40029 to your computer and use it in GitHub Desktop.
Save fuba/40029 to your computer and use it in GitHub Desktop.
exthtml.pl extracts contents specified by an xpath from web pages. Cookbook in Japanese: http://fuba.jottit.com/exthtml
#!/usr/bin/perl
use strict;
use warnings;
use Encode;
use Getopt::Long;
use URI;
use LWP::UserAgent;
use HTTP::Cookies::Guess;
use constant CHARSET => 'utf-8';
my $VERSION = '20090121_1';
my $url = pop @ARGV;
my (
$xpath, $referer, $cookie, $agent, $nextlink,
$depth, $as_xml, $verbose, $procedure, $weight,
);
my $result = GetOptions(
"x|xpath=s" => \$xpath,
"e|referer=s" => \$referer,
"c|cookie-jar=s" => \$cookie,
"a|agent=s" => \$agent,
"n|nextlink=s" => \$nextlink,
"d|depth=i" => \$depth,
"s|as-source" => \$as_xml,
"f" => \$verbose,
"p|procedure=s" => \$procedure,
"w=i" => \$weight,
);
$depth += 1;
unless ($url && $xpath) {
die "version $VERSION\nusage: ./exthtml.pl [ -a [AGENT] -e [REFERER] -c [COOKIE_JAR]"
." -n [NEXTPAGE_XPATH] -d [NEXTPAGE_DEPTH] -w [NEXTPAGE_SLEEP(sec)]"
." -p [PROCEDURE: \$v(scalar value), \$n(HTML::Element object), \$u(URI object)]"
." -s -f ] -x [XPATH] [URL]|-";
}
my $ua = LWP::UserAgent->new;
$ua->cookie_jar(HTTP::Cookies::Guess->create(file => $cookie)) if ($cookie);
$ua->agent($agent) if ($agent);
my %options_base = (
xpath => decode(CHARSET, $xpath),
referer => $referer,
ua => $ua,
as_xml => $as_xml,
nextlink => decode(CHARSET, $nextlink),
depth => $depth,
verbose => $verbose,
procedure => $procedure,
weight => $weight || 0,
);
my @url_list;
if ($url eq '-') {
while (my $url_line = <>) {
chomp $url_line;
push @url_list, $url_line;
}
}
else {
if ($url =~ /\[\d+\-\d+\]/) {
push @url_list, &expand_url($url);
}
else {
push @url_list, $url;
}
}
if (@url_list) {
for my $url_line (@url_list) {
my %options = %options_base;
$options{url} = $url_line;
extract(%options);
}
}
exit;
sub expand_url {
my $exp = shift;
my @urls;
my $format = '%d';
if ($exp =~ s/\[(\d+)\-(\d+)\]/[NUM]/) {
my ($start, $end) = ($1 <= $2) ? ($1, $2) : ($2, $1);
if ($start =~ /^0\d/) {
$format = '%0'.length($end).'d';
}
for my $num ($start..$end) {
my $url = $exp;
my $numstr = sprintf($format, $num);
$url =~ s/\[NUM\]/$numstr/;
push @urls, $url;
}
@urls = map {expand_url($_)} @urls;
}
else {
return $exp;
}
return @urls;
}
sub proc {
my ($proc, $n, $v, $u) = @_;
return eval($proc);
}
sub extract {
my %opt = @_;
my $xpath = $opt{xpath};
my $depth = $opt{depth};
my $url = $opt{url};
my $referer = $opt{referer};
my $procedure = $opt{procedure};
my $weight = $opt{weight} || 0;
my %hist;
while ($url && ($depth--)) {
last if ($hist{$url});
$hist{$url} = 1;
print "$url\n" if ($opt{verbose});
my $tree = HTML::TreeBuilder::XPath::Remote->new_from_uri(
$url, $opt{ua}, $referer
);
for my $node ($tree->findnodes($opt{xpath})) {
print "\t" if ($opt{verbose});
my $value = ($opt{as_xml} && $node->isa('HTML::Element'))
? $node->as_XML('<>&"')
: $node->getValue."\n";
$value = proc($procedure, $node, $value, URI->new($url)) if ($procedure);
print encode(CHARSET, $value);
}
$referer = $url;
$url = '';
if ($opt{nextlink}) {
my @urls =
grep /^http/,
map {$_->getValue} $tree->findnodes($opt{nextlink});
$url = $urls[0] if (@urls);
}
$tree->delete;
sleep $weight;
}
}
package HTML::TreeBuilder::XPath::Remote;
use strict;
use warnings;
use List::Util qw( first );
use Encode;
use HTML::TreeBuilder::XPath;
use HTML::ResolveLink;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response::Encoding;
sub new_from_uri {
my ($pkg, $uri, $ua, $referer) = @_;
my $resolver = HTML::ResolveLink->new(
base => $uri,
);
my $html = $resolver->resolve(
$pkg->get($uri, $ua, $referer)
);
return HTML::TreeBuilder::XPath->new_from_content($html);
}
sub get {
my ($self, $uri, $ua, $referer) = @_;
my $html;
$ua ||= LWP::UserAgent->new();
my $req = HTTP::Request->new('GET', $uri);
$req->header(referer => $referer) if ($referer);
my $res = $ua->request($req);
# this detection is based on Web::Scraper.
if ($res->is_success) {
my @encoding = (
$res->encoding,
($res->header('Content-Type') =~ /charset=([\w\-]+)/g),
);
if (eval {require Encode::Detect;}) {
push @encoding, "Detect";
}
push @encoding, "shift-jis";
my $encoding = first {
defined $_ && Encode::find_encoding($_)
} @encoding;
$html = Encode::decode($encoding, $res->content);
}
return $html;
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment