-
-
Save anonymous/cd1a08ca0919e43ebfaa81e760e06676 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/lib/Mojo/DOM/HTML.pm b/lib/Mojo/DOM/HTML.pm | |
index f2f70a29b..19e2e2250 100644 | |
--- a/lib/Mojo/DOM/HTML.pm | |
+++ b/lib/Mojo/DOM/HTML.pm | |
@@ -1,7 +1,7 @@ | |
package Mojo::DOM::HTML; | |
use Mojo::Base -base; | |
-use Mojo::Util qw(html_unescape xml_escape); | |
+use Mojo::Util qw(html_attr_unescape html_unescape xml_escape); | |
use Scalar::Util 'weaken'; | |
has tree => sub { ['root'] }; | |
@@ -125,7 +125,7 @@ sub parse { | |
# Empty tag | |
++$closing and next if $key eq '/'; | |
- $attrs{$key} = defined $value ? html_unescape $value : $value; | |
+ $attrs{$key} = defined $value ? html_attr_unescape $value : $value; | |
} | |
# "image" is an alias for "img" | |
diff --git a/lib/Mojo/Util.pm b/lib/Mojo/Util.pm | |
index 6f30be035..02648e35c 100644 | |
--- a/lib/Mojo/Util.pm | |
+++ b/lib/Mojo/Util.pm | |
@@ -51,14 +51,17 @@ my %XML = ( | |
# "Sun, 06 Nov 1994 08:49:37 GMT" and "Sunday, 06-Nov-94 08:49:37 GMT" | |
my $EXPIRES_RE = qr/(\w+\W+\d+\W+\w+\W+\d+\W+\d+:\d+:\d+\W*\w+)/; | |
+# HTML character references (numeric terminated by ";", named with optional ";") | |
+my $ENTITY_RE = qr/&(?:\#((?:[0-9]{1,7}|x[0-9a-fA-F]{1,6}));|(\w+;?))/; | |
+ | |
# Encoding cache | |
my %CACHE; | |
our @EXPORT_OK = ( | |
qw(b64_decode b64_encode camelize class_to_file class_to_path decamelize), | |
qw(decode deprecated dumper encode extract_usage getopt hmac_sha1_sum), | |
- qw(html_unescape md5_bytes md5_sum monkey_patch punycode_decode), | |
- qw(punycode_encode quote secure_compare sha1_bytes sha1_sum), | |
+ qw(html_attr_unescape html_unescape md5_bytes md5_sum monkey_patch), | |
+ qw(punycode_decode punycode_encode quote secure_compare sha1_bytes sha1_sum), | |
qw(split_cookie_header split_header steady_time tablify term_escape trim), | |
qw(unindent unquote url_escape url_unescape xml_escape xor_encode) | |
); | |
@@ -155,10 +158,15 @@ sub getopt { | |
Getopt::Long::Configure($save); | |
} | |
+sub html_attr_unescape { | |
+ my $str = shift; | |
+ $str =~ s/$ENTITY_RE/_decode($1, $2, 1)/geo; | |
+ return $str; | |
+} | |
+ | |
sub html_unescape { | |
my $str = shift; | |
- $str | |
- =~ s/&(?:\#((?:[0-9]{1,7}|x[0-9a-fA-F]{1,6}));|(\w+;?))/_decode($1, $2)/ge; | |
+ $str =~ s/$ENTITY_RE/_decode($1, $2, 0)/geo; | |
return $str; | |
} | |
@@ -368,16 +376,18 @@ sub _adapt { | |
} | |
sub _decode { | |
- my ($point, $name) = @_; | |
+ my ($point, $name, $attr) = @_; | |
# Code point | |
return chr($point !~ /^x/ ? $point : hex $point) unless defined $name; | |
# Named character reference | |
- my $rest = ''; | |
+ my $rest = my $last = ''; | |
while (length $name) { | |
- return $ENTITIES{$name} . reverse $rest if exists $ENTITIES{$name}; | |
- $rest .= chop $name; | |
+ return $ENTITIES{$name} . reverse $rest | |
+ if exists $ENTITIES{$name} | |
+ && (!$attr || $name =~ /;$/ || $last !~ /[A-Za-z0-9=]/); | |
+ $rest .= $last = chop $name; | |
} | |
return '&' . reverse $rest; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment