Last active
May 14, 2020 06:54
-
-
Save haukex/fd76efa16f0b07ce6a7441d9b2265b2a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# cpanm --installdeps . | |
requires 'Mojo::DOM'; | |
requires 'HTML::TreeBuilder::XPath'; | |
requires 'HTML::LinkExtor'; | |
requires 'WWW::Mechanize', '1.97'; | |
requires 'XML::LibXML'; | |
# for runsols.pl: | |
requires 'Path::Class'; | |
requires 'IPC::Run3'; | |
requires 'Test::More'; | |
requires 'FindBin'; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!doctype html> | |
<html lang="en"> | |
<head> | |
<meta charset="utf-8"> | |
<title>Hello, World!</title> | |
</head> | |
<body> | |
<p id="thelinks"><!--BEGIN--> | |
<a | |
href | |
= | |
"http://www.example.com/1" | |
> | |
One | |
</a | |
> | |
<a id="Two" title="href="></a> | |
<!-- | |
<a href="http://www.example.com/3">Three</a> | |
--> | |
<a title=' href="http://www.example.com/4">Four' | |
href="http://www.example.com/5">Five</a> | |
<script> | |
console.log(' <a href="http://www.example.com/6">Six</a> '); /* | |
<!-- | |
*/ </script> | |
<a href=http://www.example.com/7>Se<span | |
>ve</span>n</a> | |
<script>/* --> */</script> | |
<!--END--></p> | |
</body> | |
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use warnings; | |
use strict; | |
use FindBin; | |
use Path::Class qw/file dir/; | |
use IPC::Run3; | |
use Test::More; | |
my @INPUTS = map { file($FindBin::Bin, $_) } | |
qw/ html5.html xhtml.xhtml /; | |
my @SCRIPTS = map { file($FindBin::Bin, $_) } | |
qw/ sol-mojodom.pl sol-treexpath.pl sol-linkextor.pl sol-parser.pl | |
sol-libxml.pl sol-mechanize.pl /; | |
plan tests => @INPUTS * @SCRIPTS - 1; | |
for my $script (@SCRIPTS) { | |
for my $input (@INPUTS) { | |
next if $script->basename=~/libxml/ && $input->basename!~/\.xhtml$/; | |
my $test = $script->relative($FindBin::Bin)." on " | |
.$input->relative($script->parent); | |
subtest $test => sub { plan tests=>3; | |
run3 [$^X, "$script", "$input"], undef, \(my $out), \(my $err); | |
is $?, 0, 'exit code'; | |
is $err, '', 'STDERR'; | |
$out =~ s/^\s*#.*(\n|\z)//m; | |
is $out, $script->basename=~/linkextor/ ? <<'ONE' : <<'TWO', 'STDOUT'; | |
http://www.example.com/1 | |
http://www.example.com/5 | |
http://www.example.com/7 | |
ONE | |
http://www.example.com/1 One | |
http://www.example.com/5 Five | |
http://www.example.com/7 Seven | |
TWO | |
}; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use warnings; | |
use strict; | |
my $file = shift or die; | |
print "##### XML::LibXML on $file #####\n"; | |
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> }; | |
die "Sorry, I (currently) only handle XHTML\n" unless $html=~/^\s*<\?xml/; | |
use XML::LibXML; | |
my $doc = XML::LibXML->load_xml( string=>$html, no_network=>1, recover=>1 ); | |
my $xpc = XML::LibXML::XPathContext->new($doc); | |
$xpc->registerNs('html', 'http://www.w3.org/1999/xhtml'); | |
my @links = $xpc->findnodes(q{//html:a[@href]}); | |
for my $link (@links) { | |
( my $txt_trim = $link->textContent ) =~ s/^\s+|\s+$//g; | |
print $link->getAttribute('href'), "\t", $txt_trim, "\n"; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use warnings; | |
use strict; | |
my $file = shift or die; | |
print "##### HTML::LinkExtor on $file #####\n"; | |
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> }; | |
use HTML::LinkExtor; | |
my $p = HTML::LinkExtor->new; | |
$p->marked_sections(1); | |
$p->xml_mode( $html=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE | |
my @links = $p->parse($html)->links; | |
for my $link (@links) { | |
my ($tag, %attrs) = @$link; | |
print $attrs{href}, "\n"; # note: no text contents available | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use warnings; | |
use strict; | |
my $file = shift or die; | |
print "##### WWW::Mechanize on $file #####\n"; | |
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> }; | |
# based on https://www.perlmonks.org/?node_id=11116482 by Corion, thanks! | |
use WWW::Mechanize 1.97; # 1.97 required for CDATA sections (in our XHTML) | |
my $mech = WWW::Mechanize->new(); | |
$mech->update_html($html); | |
my @links = $mech->links(); | |
for my $link (@links) { | |
print $link->url, "\t", $link->text, "\n"; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use warnings; | |
use strict; | |
my $file = shift or die; | |
print "##### Mojo::DOM on $file #####\n"; | |
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> }; | |
use Mojo::DOM; | |
my $links = Mojo::DOM->new($html)->find('a[href]'); | |
for my $link (@$links) { | |
( my $txt_trim = $link->all_text ) =~ s/^\s+|\s+$//g; | |
print $link->{href}, "\t", $txt_trim, "\n"; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use warnings; | |
use strict; | |
my $file = shift or die; | |
print "##### HTML::Parser on $file #####\n"; | |
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> }; | |
# based on https://www.perlmonks.org/?node_id=11116482 by hippo, thanks! | |
use HTML::Parser; | |
my $state = 0; | |
my $p = HTML::Parser->new( | |
api_version => 3, | |
start_h => [ sub { | |
shift eq 'a' or return; | |
my $href = shift->{href} or return; | |
$state = 1; | |
print "$href\t"; | |
shift->handler(text => sub { | |
(my $str = shift) =~ s/^\s+|\s+$//g; | |
print $str; | |
}, 'dtext, self'); | |
}, 'tagname, attr, self'], | |
end_h => [ sub { | |
return unless shift eq 'a' && $state; | |
$state = 0; | |
print "\n"; | |
shift->handler(text => ''); | |
}, 'tagname, self'], | |
); | |
$p->marked_sections(1); | |
$p->xml_mode( $html=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE | |
$p->parse($html); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use warnings; | |
use strict; | |
my $file = shift or die; | |
print "##### HTML::TreeBuilder::XPath on $file #####\n"; | |
my $html = do { open my $fh, '<', $file or die $!; local $/; <$fh> }; | |
use HTML::TreeBuilder::XPath; | |
my $p = HTML::TreeBuilder::XPath->new; | |
$p->marked_sections(1); | |
$p->xml_mode( $html=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE | |
my @links = $p->parse($html)->findnodes('//a[@href]'); | |
for my $link (@links) { | |
print $link->attr('href'), "\t", $link->as_text_trimmed, "\n"; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[ | |
<!ATTLIST html | |
xmlns:xsi CDATA #FIXED "http://www.w3.org/2001/XMLSchema-instance" | |
xsi:schemaLocation CDATA #IMPLIED > ]> | |
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" | |
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
xsi:schemaLocation="http://www.w3.org/1999/xhtml | |
http://www.w3.org/2002/08/xhtml/xhtml1-strict.xsd"> | |
<head> | |
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8" /> | |
<title>Hello, World!</title> | |
</head> | |
<body> | |
<p id="thelinks"> | |
<a | |
href | |
= | |
"http://www.example.com/1" | |
> | |
One | |
</a | |
> | |
<a id="Two" title="href="></a> | |
<!-- | |
<a href="http://www.example.com/3">Three</a> | |
--> | |
<a title=' href="http://www.example.com/4">Four' | |
href="http://www.example.com/5">Five</a> | |
<!--BEGIN--> | |
<script type="text/javascript">/*<![CDATA[ | |
</script> | |
*/ console.log(' <a href="http://www.example.com/6">Six</a> '); /* | |
<!-- | |
]]>*/</script> | |
<a href="http://www.example.com/7"><![CDATA[Se]]><span | |
>ve</span>n</a> | |
<script type="text/javascript">/*<![CDATA[ | |
--> | |
]]>*/</script> | |
<![CDATA[ | |
<a href="http://www.example.com/8">Eight</a> | |
]]> | |
<!--END--></p> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment