Created
August 22, 2015 19:17
-
-
Save briandfoy/18d188e11c74e96a2799 to your computer and use it in GitHub Desktop.
Perly test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/Users/brian/bin/perls/perl5.22.0 | |
use utf8; | |
use Encode qw(encode decode); | |
use v5.10; | |
use open qw(:std :utf8); | |
use Data::Dumper; | |
use HTTP::Tiny; | |
use Devel::Peek; | |
=pod | |
Remember: | |
Octets are a representation of data. | |
A Perl string is an abstract representation where we don't think about storage. | |
Many people think the verbs go the opposite way they do. | |
Encoding goes to octets, which is the physical storage | |
encode turns a Perl string into octets | |
Decoding goes from the physical storage to character strings | |
decode turns octets into a Perl string | |
Perl mostly handles the translation to and from physical storage, | |
and many of the modules we use are going to handle the conversion. | |
When they don't, it is most likely because they guess at the
encoding and get it wrong:
The headers lie, because the creator saved the document incorrectly | |
The meta headers have the same problem | |
=cut | |
# Sample text shared by every case below. The source file is saved as
# UTF-8 and `use utf8` is in effect, so Perl reads this literal as a
# character string (one element per character), not as raw octets.
my $content = 'Nóirín Plunkett';

# The same text as ISO-8859-1 octets. Putting "octets" in the name is a
# reminder of which side of the encode/decode boundary the value is on.
my $latin1_octets = Encode::encode( "iso-8859-1", $content );
# Baseline: print the character string untouched. STDOUT carries the
# :utf8 layer (see `use open qw(:std :utf8)`), so the output layer does
# the encoding to octets.
{
    say "\n-----Start with UTF-8; Do nothing";
    say $content;
}
# Encode the character string to UTF-8 octets by hand, then print it.
# STDOUT's :utf8 layer encodes again on output, so the text is encoded
# twice; the recorded output after __END__ shows the resulting mojibake.
{
    my $utf8_octets = encode( 'UTF-8', $content );

    say "\n-----Start with UTF-8; Encode only";
    say $utf8_octets;
}
# Decode a value that is already a character string, then encode the
# result back to UTF-8 octets before printing. The recorded output after
# __END__ shows replacement characters where the accented letters were.
{
    say "\n-----Start with UTF-8; Decode, then encode";
    say encode( 'UTF-8', decode( 'UTF-8', $content ) );
}
# Decode the character string as if it were UTF-8 octets and print the
# result. The recorded output after __END__ shows replacement characters
# for the accented letters.
{
    my $heading = "\n-----Start with UTF-8; Decode only";
    my $decoded = decode( 'UTF-8', $content );

    say for $heading, $decoded;
}
# Decode the character string as UTF-8 and print the result.
# NOTE(review): same heading and same operation as the case directly
# before this one; it is kept because the recorded output after __END__
# lists this case twice. Confirm whether a different case was intended.
{
    say "\n-----Start with UTF-8; Decode only";
    say decode( 'UTF-8', $content );
}
# Print the ISO-8859-1 octets without decoding them first. The recorded
# output after __END__ shows the name printed intact.
{
    say for "\n-----Start with Latin1; Do nothing", $latin1_octets;
}
# Decode the ISO-8859-1 octets into a character string and print it.
# The input is $latin1_octets, not $content: this case is about data
# that arrives as Latin-1 octets, so it must start from the octets.
# STDOUT's :utf8 layer handles the encoding on output.
{
    say "\n-----Start with Latin1; Decode only";
    my $decoded_response = decode( "iso-8859-1", $latin1_octets );
    say $decoded_response;
}
# Decode the ISO-8859-1 octets into a character string, encode that
# string to UTF-8 octets by hand, then print it. The input is
# $latin1_octets, not $content, so the case really starts from Latin-1.
# STDOUT's :utf8 layer encodes again on output, so the text is encoded
# twice; the recorded output after __END__ shows the resulting mojibake.
{
    say "\n-----Start with Latin1; Decode, then encode as UTF-8";
    my $decoded_response = decode( "iso-8859-1", $latin1_octets );
    $decoded_response = encode( 'UTF-8', $decoded_response );
    say $decoded_response;
}
__END__ | |
-----Start with UTF-8; Do nothing | |
Nóirín Plunkett | |
-----Start with UTF-8; Encode only | |
NóirÃn Plunkett | |
-----Start with UTF-8; Decode, then encode | |
N�ir�n Plunkett | |
-----Start with UTF-8; Decode only | |
N�ir�n Plunkett | |
-----Start with UTF-8; Decode only | |
N�ir�n Plunkett | |
-----Start with Latin1; Do nothing | |
Nóirín Plunkett | |
-----Start with Latin1; Decode only | |
Nóirín Plunkett | |
-----Start with Latin1; Decode, then encode as UTF-8 | |
NóirÃn Plunkett |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment