osfameron/html_strip.pl

## html_strip.pl
use strict; use warnings;

use HTML::Strip;
use Devel::Peek;
use Test::More tests => 3;
use Encode;
use utf8;

=head1 Workaround for HTML::Strip with utf8

As discussed with Zefram and ilmari on #london.pm, thanks!

L<HTML::Strip> doesn't handle utf8 properly, as it's XS and probably not
written to work on characters, only bytes.

By default the parse method, when given unicode, returns a bytestring with no
unicode markings.

A naive way to handle this would be to simply decode_utf8.  This works for
utf8 strings... but not for extended latin1.

A better workaround, suggested by Zefram, is to encode and downgrade first,
then decode after.

NB: this is just a workaround.  Better solutions would be to a) fix HTML::Strip
or b) use HTML::Parser instead

=cut

my @strings = (
    {
        type   => 'ascii',
        string => 'test',
    },
    {
        type   => 'unicode',
        string => "\x{2603}",  # snowman
    },
    {
        type   => 'latin1',
        string => "L\x{e9}on",
    }
);

my $hs = HTML::Strip->new();

for my $record (@strings) {
    my $string = $record->{string};
    my $html = $string . "<br>"; # some sample html to strip

    # my $stripped = parse_simple( $html );   # fails the unicode test
    # my $stripped = parse_unicodey( $html ); # fails the latin1 test
    my $stripped = parse_workaround( $html );

    is( $string, $stripped, $record->{type} );
        # or do { Dump($string); Dump($stripped) };
}

sub parse_simple {
    my $html = shift;
    my $stripped = $hs->parse($html);
    $hs->eof;
    return $stripped;
}

sub parse_unicodey {
    my $html = shift;
    my $stripped = $hs->parse($html);
    $hs->eof;
    return decode_utf8($stripped);
}

sub parse_workaround {
    my $html = shift;
    my $octets = encode_utf8($html);
    utf8::downgrade($octets);
    my $stripped = $hs->parse($octets);
    $hs->eof;
    return decode_utf8($stripped);
}
	use strict; use warnings;

	use HTML::Strip;
	use Devel::Peek;
	use Test::More tests => 3;
	use Encode;
	use utf8;

	=head1 Workaround for HTML::Strip with utf8

	As discussed with Zefram and ilmari on #london.pm, thanks!

	L<HTML::Strip> doesn't handle utf8 properly, as it's XS and probably not
	written to work on characters, only bytes.

	By default the parse method, when given unicode, returns a bytestring with no
	unicode markings.

	A naive way to handle this would be to simply decode_utf8. This works for
	utf8 strings... but not for extended latin1.

	A better workaround, suggested by Zefram, is to encode and downgrade first,
	then decode after.

	NB: this is just a workaround. Better solutions would be to a) fix HTML::Strip
	or b) use HTML::Parser instead

	=cut

	my @strings = (
	{
	type => 'ascii',
	string => 'test',
	},
	{
	type => 'unicode',
	string => "\x{2603}", # snowman
	},
	{
	type => 'latin1',
	string => "L\x{e9}on",
	}
	);

	my $hs = HTML::Strip->new();

	for my $record (@strings) {
	my $string = $record->{string};
	my $html = $string . "<br>"; # some sample html to strip

	# my $stripped = parse_simple( $html ); # fails the unicode test
	# my $stripped = parse_unicodey( $html ); # fails the latin1 test
	my $stripped = parse_workaround( $html );

	is( $string, $stripped, $record->{type} );
	# or do { Dump($string); Dump($stripped) };
	}

	sub parse_simple {
	my $html = shift;
	my $stripped = $hs->parse($html);
	$hs->eof;
	return $stripped;
	}

	sub parse_unicodey {
	my $html = shift;
	my $stripped = $hs->parse($html);
	$hs->eof;
	return decode_utf8($stripped);
	}

	sub parse_workaround {
	my $html = shift;
	my $octets = encode_utf8($html);
	utf8::downgrade($octets);
	my $stripped = $hs->parse($octets);
	$hs->eof;
	return decode_utf8($stripped);
	}