Created
February 21, 2012 20:19
-
-
Save jberger/1878637 to your computer and use it in GitHub Desktop.
Find anchor targets using Text::Balanced
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use Data::Dumper; | |
use Text::Balanced qw/extract_bracketed extract_delimited extract_multiple/; | |
# Read the whole sample document stored after __DATA__ in a single read:
# with the input record separator undefined, <DATA> returns everything.
my $html = do {
    local $/ = undef;
    <DATA>;
};

# Collect the href of every anchor in the document and dump the list.
my @tags = find_anchor_targets($html);
print Dumper(\@tags);
# Return the href value of every anchor (<a ...>) start tag in an HTML string.
#
# Parameters:
#   $html - the markup to scan.
# Returns:
#   A list of href values in document order. Anchors for which
#   extract_href() returns the empty list contribute no element.
sub find_anchor_targets {
    my $html = shift;

    # Pull out each balanced <...> chunk. The trailing true argument asks
    # extract_multiple to drop the text between chunks, so only tags remain.
    my @tags = extract_multiple(
        $html,
        [ sub { extract_bracketed($_[0], '<>') } ],
        undef, 1
    );

    # Keep only anchor start tags. The character after the "a" must end the
    # tag name; a bare /^<a/ would also accept <abbr>, <area>, <article>, ...
    my @targets =
        map  { extract_href($_) }      # find related href=
        grep { /\A<a[\s>\/]/i }        # only anchor begin tags
        @tags;

    return @targets;
}
# Extract the value of the href attribute from the text of a single tag.
#
# Parameters:
#   $tag - the text of one tag, e.g. '<a href="x">'.
# Returns:
#   A one-element list holding the value (surrounding quotes removed), or
#   the empty list when the tag has no href attribute or when a quoted
#   value is never closed.
sub extract_href {
    my $tag = shift;

    # Locate the first real href attribute. The match is case-insensitive,
    # tolerates whitespace around "=", and the look-behind keeps attribute
    # names such as data-href from matching. /g leaves pos($tag) pointing at
    # the first character of the value.
    return () unless $tag =~ /(?<![\w-])href\s*=\s*/gi;

    my $first = substr $tag, pos($tag), 1;
    if ($first eq q{'} or $first eq q{"}) {
        # Quoted value: extract_delimited continues from pos($tag) and, in
        # scalar context, returns the value including its quotes. It returns
        # undef when the closing quote is missing.
        my $quoted = extract_delimited($tag, q{'"});
        return () unless defined $quoted;
        return substr $quoted, 1, -1;
    }

    # Unquoted value: everything up to the next whitespace or closing ">".
    # \G anchors the match at pos($tag), i.e. right after the "=".
    my ($bare) = $tag =~ /\G([^\s>]*)/;
    return $bare;
}
__DATA__ | |
Some <a href=link>link text</a> stuff. And a little <A HREF="link2">different link text</a>. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment