haukex/cpanfile

## cpanfile
# cpanm --installdeps .
requires 'Mojo::DOM';
requires 'HTML::TreeBuilder::XPath';
requires 'HTML::LinkExtor';
requires 'WWW::Mechanize', '1.97';
requires 'XML::LibXML';
# for runsols.pl:
requires 'Path::Class';
requires 'IPC::Run3';
requires 'Test::More';
requires 'FindBin';

## html5.html
<!doctype html>
<html lang="en">
<head>
	<meta charset="utf-8">
	<title>Hello, World!</title>
</head>
<body>
<p id="thelinks"><!--BEGIN-->

<a
href
=
"http://www.example.com/1"
>
One
</a
>
<a id="Two" title="href="></a>
<!--
<a href="http://www.example.com/3">Three</a>
-->
<a title=' href="http://www.example.com/4">Four'
href="http://www.example.com/5">Five</a>
<script>
console.log(' <a href="http://www.example.com/6">Six</a> '); /*
<!--
*/ </script>
<a href=http://www.example.com/7>Se<span
>v&#101;</span>n</a>
<script>/* --> */</script>

<!--END--></p>
</body>
</html>

## runsols.pl
#!/usr/bin/env perl
use warnings;
use strict;
use FindBin;
use Path::Class qw/file dir/;
use IPC::Run3;
use Test::More;

my @INPUTS = map { file($FindBin::Bin, $_) }
	qw/ html5.html xhtml.xhtml /;
my @SCRIPTS = map { file($FindBin::Bin, $_) }
	qw/ sol-mojodom.pl sol-treexpath.pl sol-linkextor.pl sol-parser.pl
		sol-libxml.pl sol-mechanize.pl /;
plan tests => @INPUTS * @SCRIPTS - 1;

for my $script (@SCRIPTS) {
	for my $input (@INPUTS) {
		next if $script->basename=~/libxml/ && $input->basename!~/\.xhtml$/;
		my $test = $script->relative($FindBin::Bin)." on "
			.$input->relative($script->parent);
		subtest $test => sub { plan tests=>3;
			run3 [$^X, "$script", "$input"], undef, \(my $out), \(my $err);
			is $?, 0, 'exit code';
			is $err, '', 'STDERR';
			$out =~ s/^\s*#.*(\n|\z)//m;
			is $out, $script->basename=~/linkextor/ ? <<'ONE' : <<'TWO', 'STDOUT';
http://www.example.com/1
http://www.example.com/5
http://www.example.com/7
ONE
http://www.example.com/1	One
http://www.example.com/5	Five
http://www.example.com/7	Seven
TWO
		};
	}
}

## sol-libxml.pl
#!/usr/bin/env perl
use warnings;
use strict;

my $file = shift or die;
print "##### XML::LibXML on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };
die "Sorry, I (currently) only handle XHTML\n" unless $html=~/^\s*<\?xml/;

use XML::LibXML;
my $doc = XML::LibXML->load_xml( string=>$html, no_network=>1, recover=>1 );
my $xpc = XML::LibXML::XPathContext->new($doc);
$xpc->registerNs('html', 'http://www.w3.org/1999/xhtml');
my @links = $xpc->findnodes(q{//html:a[@href]});
for my $link (@links) {
	( my $txt_trim = $link->textContent ) =~ s/^\s+|\s+$//g;
	print $link->getAttribute('href'), "\t", $txt_trim, "\n";
}

## sol-linkextor.pl
#!/usr/bin/env perl
use warnings;
use strict;

my $file = shift or die;
print "##### HTML::LinkExtor on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };

use HTML::LinkExtor;
my $p = HTML::LinkExtor->new;
$p->marked_sections(1);
$p->xml_mode( $html=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE
my @links = $p->parse($html)->links;
for my $link (@links) {
	my ($tag, %attrs) = @$link;
	print $attrs{href}, "\n"; # note: no text contents available
}

## sol-mechanize.pl
#!/usr/bin/env perl
use warnings;
use strict;

my $file = shift or die;
print "##### WWW::Mechanize on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };

# based on https://www.perlmonks.org/?node_id=11116482 by Corion, thanks!
use WWW::Mechanize 1.97; # 1.97 required for CDATA sections (in our XHTML)
my $mech = WWW::Mechanize->new();
$mech->update_html($html);
my @links = $mech->links();
for my $link (@links) {
	print $link->url, "\t", $link->text, "\n";
}

## sol-mojodom.pl
#!/usr/bin/env perl
use warnings;
use strict;

my $file = shift or die;
print "##### Mojo::DOM on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };

use Mojo::DOM;
my $links = Mojo::DOM->new($html)->find('a[href]');
for my $link (@$links) {
	( my $txt_trim = $link->all_text ) =~ s/^\s+|\s+$//g;
	print $link->{href}, "\t", $txt_trim, "\n";
}

## sol-parser.pl
#!/usr/bin/env perl
use warnings;
use strict;

my $file = shift or die;
print "##### HTML::Parser on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };

# based on https://www.perlmonks.org/?node_id=11116482 by hippo, thanks!
use HTML::Parser;
my $state = 0;
my $p = HTML::Parser->new(
	api_version => 3,
	start_h => [ sub {
			shift eq 'a' or return;
			my $href = shift->{href} or return;
			$state = 1;
			print "$href\t";
			shift->handler(text => sub {
				(my $str = shift) =~ s/^\s+|\s+$//g;
				print $str;
			}, 'dtext, self');
		}, 'tagname, attr, self'],
	end_h => [ sub {
			return unless shift eq 'a' && $state;
			$state = 0;
			print "\n";
			shift->handler(text => '');
		}, 'tagname, self'],
);
$p->marked_sections(1);
$p->xml_mode( $html=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE
$p->parse($html);

## sol-treexpath.pl
#!/usr/bin/env perl
use warnings;
use strict;

my $file = shift or die;
print "##### HTML::TreeBuilder::XPath on $file #####\n";
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> };

use HTML::TreeBuilder::XPath;
my $p = HTML::TreeBuilder::XPath->new;
$p->marked_sections(1);
$p->xml_mode( $html=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE
my @links = $p->parse($html)->findnodes('//a[@href]');
for my $link (@links) {
	print $link->attr('href'), "\t", $link->as_text_trimmed, "\n";
}

## xhtml.xhtml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[
<!ATTLIST html
	xmlns:xsi CDATA #FIXED "http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation CDATA #IMPLIED  > ]>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://www.w3.org/1999/xhtml
	http://www.w3.org/2002/08/xhtml/xhtml1-strict.xsd">
<head>
	<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8" />
	<title>Hello, World!</title>
</head>
<body>
<p id="thelinks">

<a
href
=
"http://www.example.com/1"
>
One
</a
>
<a id="Two" title="href="></a>
<!--
<a href="http://www.example.com/3">Three</a>
-->
<a title=' href="http://www.example.com/4">Four'
href="http://www.example.com/5">Five</a>
<!--BEGIN-->
<script type="text/javascript">/*<![CDATA[
</script>
*/ console.log(' <a href="http://www.example.com/6">Six</a> '); /*
<!--
]]>*/</script>
<a href="http://www.example.com/7"><![CDATA[Se]]><span
>v&#101;</span>n</a>
<script type="text/javascript">/*<![CDATA[
-->
]]>*/</script>
<![CDATA[
<a href="http://www.example.com/8">Eight</a>
]]>

<!--END--></p>
</body>
</html>
	# cpanm --installdeps .
	requires 'Mojo::DOM';
	requires 'HTML::TreeBuilder::XPath';
	requires 'HTML::LinkExtor';
	requires 'WWW::Mechanize', '1.97';
	requires 'XML::LibXML';
	# for runsols.pl:
	requires 'Path::Class';
	requires 'IPC::Run3';
	requires 'Test::More';
	requires 'FindBin';
	<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<title>Hello, World!</title>
	</head>
	<body>
	<p id="thelinks"><!--BEGIN-->

	<a
	href
	=
	"http://www.example.com/1"
	>
	One
	</a
	>
	<a id="Two" title="href="></a>
	<!--
	<a href="http://www.example.com/3">Three</a>
	-->
	<a title=' href="http://www.example.com/4">Four'
	href="http://www.example.com/5">Five</a>
	<script>
	console.log(' <a href="http://www.example.com/6">Six</a> '); /*
	<!--
	*/ </script>
	<a href=http://www.example.com/7>Se<span
	>ve</span>n</a>
	<script>/* --> */</script>

	<!--END--></p>
	</body>
	</html>
	#!/usr/bin/env perl
	use warnings;
	use strict;
	use FindBin;
	use Path::Class qw/file dir/;
	use IPC::Run3;
	use Test::More;

	my @INPUTS = map { file($FindBin::Bin, $_) }
	qw/ html5.html xhtml.xhtml /;
	my @SCRIPTS = map { file($FindBin::Bin, $_) }
	qw/ sol-mojodom.pl sol-treexpath.pl sol-linkextor.pl sol-parser.pl
	sol-libxml.pl sol-mechanize.pl /;
	plan tests => @INPUTS * @SCRIPTS - 1;

	for my $script (@SCRIPTS) {
	for my $input (@INPUTS) {
	next if $script->basename=~/libxml/ && $input->basename!~/\.xhtml$/;
	my $test = $script->relative($FindBin::Bin)." on "
	.$input->relative($script->parent);
	subtest $test => sub { plan tests=>3;
	run3 [$^X, "$script", "$input"], undef, \(my $out), \(my $err);
	is $?, 0, 'exit code';
	is $err, '', 'STDERR';
	$out =~ s/^\s#.(\n\|\z)//m;
	is $out, $script->basename=~/linkextor/ ? <<'ONE' : <<'TWO', 'STDOUT';
	http://www.example.com/1
	http://www.example.com/5
	http://www.example.com/7
	ONE
	http://www.example.com/1 One
	http://www.example.com/5 Five
	http://www.example.com/7 Seven
	TWO
	};
	}
	}
	<?xml version="1.0" encoding="UTF-8"?>
	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[
	<!ATTLIST html
	xmlns:xsi CDATA #FIXED "http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation CDATA #IMPLIED > ]>
	<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://www.w3.org/1999/xhtml
	http://www.w3.org/2002/08/xhtml/xhtml1-strict.xsd">
	<head>
	<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8" />
	<title>Hello, World!</title>
	</head>
	<body>
	<p id="thelinks">

	<a
	href
	=
	"http://www.example.com/1"
	>
	One
	</a
	>
	<a id="Two" title="href="></a>
	<!--
	<a href="http://www.example.com/3">Three</a>
	-->
	<a title=' href="http://www.example.com/4">Four'
	href="http://www.example.com/5">Five</a>
	<!--BEGIN-->
	<script type="text/javascript">/*<![CDATA[
	</script>
	/ console.log(' <a href="http://www.example.com/6">Six</a> '); /
	<!--
	]]>*/</script>
	<a href="http://www.example.com/7"><![CDATA[Se]]><span
	>ve</span>n</a>
	<script type="text/javascript">/*<![CDATA[
	-->
	]]>*/</script>
	<![CDATA[
	<a href="http://www.example.com/8">Eight</a>
	]]>

	<!--END--></p>
	</body>
	</html>