Skip to content

Instantly share code, notes, and snippets.

@haukex
Last active May 14, 2020 06:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save haukex/fd76efa16f0b07ce6a7441d9b2265b2a to your computer and use it in GitHub Desktop.
Save haukex/fd76efa16f0b07ce6a7441d9b2265b2a to your computer and use it in GitHub Desktop.
# cpanm --installdeps .
requires 'Mojo::DOM';
requires 'HTML::TreeBuilder::XPath';
requires 'HTML::LinkExtor';
requires 'WWW::Mechanize', '1.97';
requires 'XML::LibXML';
# for runsols.pl:
requires 'Path::Class';
requires 'IPC::Run3';
requires 'Test::More';
requires 'FindBin';
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Hello, World!</title>
</head>
<body>
<p id="thelinks"><!--BEGIN-->
<a
href
=
"http://www.example.com/1"
>
One
</a
>
<a id="Two" title="href="></a>
<!--
<a href="http://www.example.com/3">Three</a>
-->
<a title=' href="http://www.example.com/4">Four'
href="http://www.example.com/5">Five</a>
<script>
console.log(' <a href="http://www.example.com/6">Six</a> '); /*
<!--
*/ </script>
<a href=http://www.example.com/7>Se<span
>v&#101;</span>n</a>
<script>/* --> */</script>
<!--END--></p>
</body>
</html>
#!/usr/bin/env perl
use warnings;
use strict;
use FindBin;
use Path::Class qw/file dir/;
use IPC::Run3;
use Test::More;
my @INPUTS = map { file($FindBin::Bin, $_) }
qw/ html5.html xhtml.xhtml /;
my @SCRIPTS = map { file($FindBin::Bin, $_) }
qw/ sol-mojodom.pl sol-treexpath.pl sol-linkextor.pl sol-parser.pl
sol-libxml.pl sol-mechanize.pl /;
plan tests => @INPUTS * @SCRIPTS - 1;
for my $script (@SCRIPTS) {
for my $input (@INPUTS) {
next if $script->basename=~/libxml/ && $input->basename!~/\.xhtml$/;
my $test = $script->relative($FindBin::Bin)." on "
.$input->relative($script->parent);
subtest $test => sub { plan tests=>3;
run3 [$^X, "$script", "$input"], undef, \(my $out), \(my $err);
is $?, 0, 'exit code';
is $err, '', 'STDERR';
$out =~ s/^\s*#.*(\n|\z)//m;
is $out, $script->basename=~/linkextor/ ? <<'ONE' : <<'TWO', 'STDOUT';
http://www.example.com/1
http://www.example.com/5
http://www.example.com/7
ONE
http://www.example.com/1 One
http://www.example.com/5 Five
http://www.example.com/7 Seven
TWO
};
}
}
#!/usr/bin/env perl
use warnings;
use strict;
my $file = shift or die;
print "##### XML::LibXML on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };
die "Sorry, I (currently) only handle XHTML\n" unless $html=~/^\s*<\?xml/;
use XML::LibXML;
my $doc = XML::LibXML->load_xml( string=>$html, no_network=>1, recover=>1 );
my $xpc = XML::LibXML::XPathContext->new($doc);
$xpc->registerNs('html', 'http://www.w3.org/1999/xhtml');
my @links = $xpc->findnodes(q{//html:a[@href]});
for my $link (@links) {
( my $txt_trim = $link->textContent ) =~ s/^\s+|\s+$//g;
print $link->getAttribute('href'), "\t", $txt_trim, "\n";
}
#!/usr/bin/env perl
use warnings;
use strict;
my $file = shift or die;
print "##### HTML::LinkExtor on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };
use HTML::LinkExtor;
my $p = HTML::LinkExtor->new;
$p->marked_sections(1);
$p->xml_mode( $html=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE
my @links = $p->parse($html)->links;
for my $link (@links) {
my ($tag, %attrs) = @$link;
print $attrs{href}, "\n"; # note: no text contents available
}
#!/usr/bin/env perl
use warnings;
use strict;
my $file = shift or die;
print "##### WWW::Mechanize on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };
# based on https://www.perlmonks.org/?node_id=11116482 by Corion, thanks!
use WWW::Mechanize 1.97; # 1.97 required for CDATA sections (in our XHTML)
my $mech = WWW::Mechanize->new();
$mech->update_html($html);
my @links = $mech->links();
for my $link (@links) {
print $link->url, "\t", $link->text, "\n";
}
#!/usr/bin/env perl
use warnings;
use strict;
my $file = shift or die;
print "##### Mojo::DOM on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };
use Mojo::DOM;
my $links = Mojo::DOM->new($html)->find('a[href]');
for my $link (@$links) {
( my $txt_trim = $link->all_text ) =~ s/^\s+|\s+$//g;
print $link->{href}, "\t", $txt_trim, "\n";
}
#!/usr/bin/env perl
use warnings;
use strict;
my $file = shift or die;
print "##### HTML::Parser on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };
# based on https://www.perlmonks.org/?node_id=11116482 by hippo, thanks!
use HTML::Parser;
my $state = 0;
my $p = HTML::Parser->new(
api_version => 3,
start_h => [ sub {
shift eq 'a' or return;
my $href = shift->{href} or return;
$state = 1;
print "$href\t";
shift->handler(text => sub {
(my $str = shift) =~ s/^\s+|\s+$//g;
print $str;
}, 'dtext, self');
}, 'tagname, attr, self'],
end_h => [ sub {
return unless shift eq 'a' && $state;
$state = 0;
print "\n";
shift->handler(text => '');
}, 'tagname, self'],
);
$p->marked_sections(1);
$p->xml_mode( $html=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE
$p->parse($html);
#!/usr/bin/env perl
use warnings;
use strict;
my $file = shift or die;
print "##### HTML::TreeBuilder::XPath on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };
use HTML::TreeBuilder::XPath;
my $p = HTML::TreeBuilder::XPath->new;
$p->marked_sections(1);
$p->xml_mode( $html=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE
my @links = $p->parse($html)->findnodes('//a[@href]');
for my $link (@links) {
print $link->attr('href'), "\t", $link->as_text_trimmed, "\n";
}
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[
<!ATTLIST html
xmlns:xsi CDATA #FIXED "http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation CDATA #IMPLIED > ]>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.w3.org/1999/xhtml
http://www.w3.org/2002/08/xhtml/xhtml1-strict.xsd">
<head>
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8" />
<title>Hello, World!</title>
</head>
<body>
<p id="thelinks">
<a
href
=
"http://www.example.com/1"
>
One
</a
>
<a id="Two" title="href="></a>
<!--
<a href="http://www.example.com/3">Three</a>
-->
<a title=' href="http://www.example.com/4">Four'
href="http://www.example.com/5">Five</a>
<!--BEGIN-->
<script type="text/javascript">/*<![CDATA[
</script>
*/ console.log(' <a href="http://www.example.com/6">Six</a> '); /*
<!--
]]>*/</script>
<a href="http://www.example.com/7"><![CDATA[Se]]><span
>v&#101;</span>n</a>
<script type="text/javascript">/*<![CDATA[
-->
]]>*/</script>
<![CDATA[
<a href="http://www.example.com/8">Eight</a>
]]>
<!--END--></p>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment