Skip to content

Instantly share code, notes, and snippets.

@jberger
Created February 21, 2012 20:19
Show Gist options
  • Save jberger/1878637 to your computer and use it in GitHub Desktop.
Save jberger/1878637 to your computer and use it in GitHub Desktop.
Find anchor targets using Text::Balanced
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
use Text::Balanced qw/extract_bracketed extract_delimited extract_multiple/;
# Read the whole sample document stored after __DATA__ in a single read.
my $html = do {
    local $/ = undef;    # slurp mode: no input record separator inside this block
    <DATA>;
};

# Collect the target of every anchor tag and dump the resulting list.
my @tags = find_anchor_targets($html);
print Dumper( \@tags );
# find_anchor_targets($html)
#
# Returns the list of href values found in the <a ...> start tags of $html,
# in document order.  Anchors without an href contribute nothing to the list.
# Returns an empty list when $html is undefined.
#
# NOTE: tags are located with extract_bracketed using only '<>' as the
# delimiters, so a literal '<' or '>' inside a quoted attribute value is
# counted as a bracket and may cut a tag short.
sub find_anchor_targets {
    my $html = shift;

    # extract_multiple falls back to $_ when its text argument is undef;
    # never parse an unrelated global by accident.
    return unless defined $html;

    # Every <...> chunk in the document; the text between tags is discarded
    # (fourth argument true), and there is no limit on the number of fields
    # (third argument undef).
    my @tags = extract_multiple(
        $html,
        [ sub { extract_bracketed( $_[0], '<>' ) } ],
        undef, 1
    );

    my @targets =
        map  { extract_href($_) }     # find related href=
        grep { /\A<a[\s>\/]/i }       # only anchor begin tags: the tag name
                                      # must be exactly "a", so <area>,
                                      # <abbr>, <article>, ... are skipped
        @tags;

    return @targets;
}
# extract_href($tag)
#
# Given the text of a single start tag (e.g. '<a href="x">'), returns the
# value of its href attribute with any surrounding quotes removed.
# Returns an empty list (undef in scalar context) when the tag has no href
# attribute or when a quoted value is never closed.
#
# The attribute name is matched case-insensitively, whitespace around '='
# is tolerated, and names that merely end in "href" (such as data-href)
# are not mistaken for href itself.
sub extract_href {
    my $tag = shift;
    return unless defined $tag;

    # Quoted value.  /gc leaves pos($tag) just before the opening quote,
    # which is where extract_delimited starts looking.
    if ( $tag =~ /(?<![\w-])href\s*=\s*(?=['"])/gci ) {
        my $quoted = extract_delimited( $tag, q{'"} );

        # Unterminated quote: there is no usable value.
        return unless defined $quoted;

        # Drop the opening and closing delimiter.
        return substr $quoted, 1, -1;
    }

    # Unquoted value: runs up to the next whitespace or the closing '>'.
    if ( $tag =~ /(?<![\w-])href\s*=\s*([^\s>]*)/i ) {
        return $1;
    }

    return;
}
__DATA__
Some <a href=link>link text</a> stuff. And a little <A HREF="link2">different link text</a>.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment