Skip to content

Instantly share code, notes, and snippets.

@dtonhofer
Last active May 7, 2019 10:32
Show Gist options
  • Save dtonhofer/29c8d561c911cc93052f2bb2181ee75e to your computer and use it in GitHub Desktop.
Save dtonhofer/29c8d561c911cc93052f2bb2181ee75e to your computer and use it in GitHub Desktop.
Comparing behaviour of Perl Data::Dumper when using "Pure Perl" and "XS" mode for non-iso-8859-1 codepoints
#!/usr/bin/perl
# ===
# Testing what Perl's Data::Dumper does with "high" characters e.g.
#
# å -> iso-8859-1 : 0xE5
# Unicode UTF-16 : 0x00E5
# Unicode UTF-8 : 0xC3A5
#
# See also:
#
# https://stackoverflow.com/questions/50489062/how-to-display-readable-utf-8-strings-with-datadumper
#
# Note that the "UTF-8" pragma is on.
# The character 'å' is encoded in UTF-8 *in this program file* and
# the pragma tells Perl that this is so!
#
# We also tell Perl that STDERR and STDOUT and the test files to write
# are/shall-be UTF-8 encoded.
#
# We find:
# ========
#
# The string:
#
# 'Nuuk (Godthåb)'
#
# - is written by Data::Dumper, pure Perl implementation, as UTF-8 (not ISO-8559-1)
# but for higher characters, e.g. "Ч" (Cyrillic Che), the implementation switches
# to ASCII-based escaping (perl string escaping) of UTF-16: "\x{427}" (Unicode 0x427)
#
# - is written by Data::Dumper, XS implementation, as ASCII-based escaping of
# ISO-8859-1 (not UTF-8): "\x{e5}". For higher characters, the implementation
# switches to ASCII-based escaping of UTF-16.
# The implementation seems to eagerly escape anything beyong 7 bit.
#
# In both cases, reading back as UTF-8 works!
# ===
use strict;
use warnings;
use utf8; # Meaning "This lexical scope (i.e. file) contains utf8"
use File::Temp qw(tempfile tempdir);
use File::Spec::Functions qw(catfile);
use Data::Dumper;
# ---
# To print accented chars correctly to STDERR/STDOUT, supposed to be in UTF-8.
# https://perldoc.perl.org/perlunifaq.html
# ---
binmode STDERR, ':encoding(UTF-8)';
binmode STDOUT, ':encoding(UTF-8)';
# ---
# Just call dat main!
# ---
_main();
# ===
# 1) Define data
# 2) Create temporary directory
# 3) For Perl and XS implementation of Data::Dumper:
# 1) Dump data to file in said temporary directory
# 2) Read file back and eval it
# 3) Compare original data and data resulting from eval
# ===
sub _main {
my $data = { EGKK => "London Gatwick"
,BGAA => "Aasiaat (Egedesminde)"
,BGSF => "Kangerlussuaq (Søndre Strømfjord)"
,BGGH => "Nuuk (Godthåb)"
,USHQ => "Белоя́рский"
,USCC => "Челя́бинск"
,TFFR => "Aéroport Guadeloupe - Pôle Caraïbes"
,BKPR => "Aeroporti Ndërkombëtar i Prishtinës 'Adem Jashari'"
};
determineUtf8Flags($data);
my $outdir = makeTmpDir();
# $usePerl = 1 --> use pure Perl implementation
# $usePerl = 0 --> use XS implementation
my $names = { 1 => 'pure_perl', 0 => 'xs' };
for my $usePerl ( qw( 0 1 ) ) {
my $fqfn = makeFullyQualifiedFilename($outdir,$$names{$usePerl});
{
open(my $fh,">:encoding(UTF-8)", $fqfn) || die "Could not open file '$fqfn' for writing: $!";
$$data{used} = "Data::Dumper, $$names{$usePerl}";
$$data{file} = $fqfn;
print $fh Data::Dumper->new([$data])->Useperl($usePerl)->Purity(1)->Sortkeys(1)->Dump;
close $fh || die "Could not close file '$fqfn' after writing: $!"
}
my $reData = slurpAndEval($fqfn,"data");
determineUtf8Flags($reData);
for my $key (sort keys %$data) {
next if ($key eq 'used' || $key eq 'data' || $key =~ /utf8/);
my $orig = $$data{$key};
die "No key '$key' in data extracted from '$fqfn'" unless exists $$reData{$key};
my $reValue = $$reData{$key};
if ($reValue ne $orig) {
print STDERR "Key '$key': Previously '$orig', afterwards '$reValue'\n"
}
else {
print STDERR "Key '$key': No change\n"
}
}
}
print STDERR "Running a 'diff --side-by-side'!\n";
system ("diff", "--side-by-side", makeFullyQualifiedFilename($outdir, $$names{0}), makeFullyQualifiedFilename($outdir, $$names{1}));
}
sub makeTmpDir {
my $outdir = tempdir("test_XXXX", DIR => '/tmp') || die "Could not create temporary directory: $!";
print STDERR "Output goes to files in directory '$outdir' (this directory will not be automatically removed later!)\n";
return $outdir
}
sub makeFullyQualifiedFilename {
my($dir,$impl) = @_;
return catfile($dir,"$impl.dump")
}
sub slurpAndEval {
my($fn,$name) = @_;
my $txt;
{
open(my $fh, '<:encoding(UTF-8)', $fn) or die "Could not open file '$fn' for reading: $!";
# https://perlmaven.com/slurp
# - undefine the record terminator to NOT break apart input!
# - make sure this is a local variable so as not to stress anyone else
local $/ = undef;
$txt = <$fh>;
close $fh;
# redefine the record terminator to be '\n' (n.b. this must be a string, not a character!!!)
$/ = "\n";
}
# Danger Will Robinson!! We are using EVAL, so the data better be gud (i.e. not include a call to rm -rf for example)!
# Assume the text to eval assigns $VAR1
# >>>
my $VAR1;
eval($txt);
# <<<
die "Error in eval of $name content from file '$fn': $@" unless $VAR1;
my $len = scalar (keys %$VAR1);
print STDERR "Read '$name' content from file '$fn' ($len elements found in undumped hash)\n";
return $VAR1
}
sub determineUtf8Flags {
my($data) = @_;
for my $key (sort keys %$data) {
next if ($key eq 'used' || $key eq 'data' || $key =~ /utf8/);
my $str = $$data{$key};
my $val;
if (utf8::is_utf8($str)) { $val = 'yes' } else { $val = 'no' }
$$data{"${key}_utf8"} = $val
}
}
@dtonhofer
Copy link
Author

test script updated a bit

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment