public
Created

workaround for HTML::Strip with utf8

  • Download Gist
html_strip.pl
Perl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
use strict; use warnings;
 
use HTML::Strip;
use Devel::Peek;
use Test::More tests => 3;
use Encode;
use utf8;
 
=head1 Workaround for HTML::Strip with utf8
 
As discussed with Zefram and ilmari on #london.pm, thanks!
 
L<HTML::Strip> doesn't handle utf8 properly, as it's XS and probably not
written to work on characters, only bytes.
 
By default the parse method, when given unicode, returns a bytestring with no
unicode markings.
 
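A naive way to handle this would be to simply call C<decode_utf8> on the
result. This works for strings containing characters above 0xFF, but not for
latin1 strings with characters in the 0x80-0xFF range: those reach the parser
as single bytes, which are not valid utf8 sequences.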
 
A better workaround, suggested by Zefram, is to encode and downgrade first,
then decode after.
 
NB: this is just a workaround. Better solutions would be to a) fix HTML::Strip
or b) use HTML::Parser instead
 
=cut
 
# Sample inputs, each labelled with the kind of string it represents:
#   ascii   - plain 7-bit text
#   unicode - a code point above 0xFF
#   latin1  - a code point in the 0x80-0xFF range
my @strings = (
    {
        type   => 'ascii',
        string => 'test',
    },
    {
        type   => 'unicode',
        string => "\x{2603}",    # snowman
    },
    {
        type   => 'latin1',
        string => "L\x{e9}on",
    },
);

# A single stripper shared by all the parse_* subs below; each of them calls
# ->eof after parsing so the object can be reused for the next string.
my $hs = HTML::Strip->new();

for my $record (@strings) {
    my $string = $record->{string};
    my $html   = $string . "<br>";    # some sample html to strip

    # my $stripped = parse_simple( $html );   # fails the unicode test
    # my $stripped = parse_unicodey( $html ); # fails the latin1 test
    my $stripped = parse_workaround( $html );

    # Test::More's is() takes (got, expected, name): the stripped output is
    # what we got, the original string is what we expect, so that failure
    # diagnostics are labelled the right way round.
    is( $stripped, $string, $record->{type} );
    # or do { Dump($string); Dump($stripped) };
}
 
# Strip the markup from an HTML string using the shared HTML::Strip object,
# returning whatever the parser hands back without any decoding.
# (The caller's commented-out line notes that this fails the unicode test.)
sub parse_simple {
    my ($markup) = @_;

    my $text = $hs->parse($markup);
    $hs->eof;    # finish this document so $hs can be reused

    return $text;
}
 
# Strip the markup from an HTML string, then decode the parser's output as
# UTF-8 before returning it.
# (The caller's commented-out line notes that this fails the latin1 test.)
sub parse_unicodey {
    my ($markup) = @_;

    my $raw = $hs->parse($markup);
    $hs->eof;    # finish this document so $hs can be reused

    return decode_utf8($raw);
}
 
# Strip the markup from an HTML string while keeping non-ASCII characters
# intact: encode the input to UTF-8 octets, downgrade them, run the parser
# on those octets, and decode its output back into characters.
sub parse_workaround {
    my ($markup) = @_;

    # Hand the parser bytes rather than characters.
    my $bytes = encode_utf8($markup);
    utf8::downgrade($bytes);

    my $raw = $hs->parse($bytes);
    $hs->eof;    # finish this document so $hs can be reused

    # The output is treated as UTF-8 octets; turn it back into characters.
    return decode_utf8($raw);
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.