Skip to content

Instantly share code, notes, and snippets.

@chansen
Created October 3, 2010 12:02
Show Gist options
  • Save chansen/c9884c0817463fa34284 to your computer and use it in GitHub Desktop.
Save chansen/c9884c0817463fa34284 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
# encoding::heuristic -- a handler object for the ${^ENCODING} interpreter
# hook. Its decode method guesses, per string, whether the bytes are UTF-8
# octets (decode them) or native 8-bit characters (just upgrade them).
#
# NOTE(review): perlvar documents ${^ENCODING} as no longer supported
# (deprecated in 5.22, removed in 5.26); this package is expected to work
# only on older perls -- confirm the target version before relying on it.
{
package encoding::heuristic;
# Encode object for 'utf8', looked up once at compile time. Within this
# package it is only used by cat_decode.
our $Encoding;
BEGIN {
require Encode;
$Encoding = Encode::find_encoding('utf8');
}
# import: install a handler as ${^ENCODING}. The handler is a blessed
# reference to an otherwise unused scalar, so it carries no state of its own.
# Arguments are ignored, which is why it can be called as a plain function.
# The assignment is not localized here, and no unimport is provided.
sub import {
${^ENCODING} = bless \my $x, __PACKAGE__;
}
# decode($self, $string): the heuristic itself. Works in place on the
# caller's scalar through the @_ alias $_[1]:
#   - utf8::decode is tried first; it returns false when the octets are not
#     well-formed UTF-8;
#   - in that case utf8::upgrade is applied instead, which changes only the
#     internal representation and keeps each byte as the same character.
# The final bare $_[1] is the sub's value.
# NOTE(review): presumably perl calls this through ${^ENCODING} whenever a
# byte string has to be upgraded -- confirm against the perl version in use.
# NOTE(review): the reason for :lvalue is not evident from this file; do not
# drop the attribute or add an explicit return without testing.
sub decode : lvalue {
# Clear the hook for the duration of this call -- presumably so the utf8::
# calls below cannot re-enter decode; TODO confirm.
local ${^ENCODING};
utf8::upgrade($_[1])
unless utf8::decode($_[1]);
$_[1];
}
# cat_decode: drop the invocant and forward the remaining arguments unchanged
# to the cat_decode method of the 'utf8' Encode object held in $Encoding.
# See Encode::Encoding for that method's argument contract.
sub cat_decode {
shift;
return $Encoding->cat_decode(@_);
}
}
# Install the encoding::heuristic handler. Being inside BEGIN, this runs as
# soon as the block has been parsed, before the code below it is compiled.
# import() ignores its arguments, so the class-method call is equivalent to
# calling the sub directly.
BEGIN {
    encoding::heuristic->import;
}
use Test::More qw[no_plan];
# Concatenating a character string, UTF-8 octets and a native byte.
{
    # unicode string . UTF-8 encoded U+263A . Latin-1 encoded U+00C4
    my $mixed = "\x{263A}" . "\xE2\x98\xBA" . "\xC4";
    cmp_ok(
        $mixed, "eq", "\x{263A}\x{263A}\x{c4}",
        "No mojibake when concatenating",
    );
}

# utf8::upgrade on a string written as the UTF-8 octets of U+263A.
{
    my $octets = "\xE2\x98\xBA";
    utf8::upgrade($octets);
    cmp_ok(
        $octets, "eq", "\x{263A}",
        "No mojibake when upgrading UTF-8 octets",
    );
}

# utf8::upgrade on a single native byte that is not valid UTF-8.
{
    my $native = "\xC4";
    utf8::upgrade($native);
    cmp_ok(
        $native, "eq", "\x{c4}",
        "Upgrading native still works",
    );
}

# utf8::encode on a string written as UTF-8 octets.
{
    my $octets = "\xE2\x98\xBA";
    utf8::encode($octets);
    cmp_ok(
        $octets, "eq", "\xE2\x98\xBA",
        "Encoding UTF-8 octets just work",
    );
}

# utf8::encode on a single native byte.
{
    my $native = "\xC4";
    utf8::encode($native);
    cmp_ok(
        $native, "eq", "\xC3\x84",
        "So does native",
    );
}

# Direct comparison of a character literal against its UTF-8 octets.
cmp_ok(
    "\x{263A}", "eq", "\xE2\x98\xBA",
    "Equality of unicode and UTF-8 octets",
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment