Skip to content

Instantly share code, notes, and snippets.

@makamaka
Created April 28, 2010 08:55
Show Gist options
  • Save makamaka/381890 to your computer and use it in GitHub Desktop.
Save makamaka/381890 to your computer and use it in GitHub Desktop.
#!perl
# benchmark encoding detection for JSON
use strict;
use Modern::Perl;
use Benchmark qw(:all);
use utf8;
use Encode;
# >>> copied from Mojo::JSON
# Regex source strings mapped to the encoding each one identifies.
# The keys are double-quoted, so every \0 below is already a literal NUL
# octet by the time the string is interpolated into a pattern. Each key
# describes the NUL / non-NUL layout of the first four octets of a text;
# test_mojo_json anchors it with ^ when matching.
my $UTF_PATTERNS = {
"\0\0\0[^\0]" => 'UTF-32BE',
"\0[^\0]\0[^\0]" => 'UTF-16BE',
"[^\0]\0\0\0" => 'UTF-32LE',
"[^\0]\0[^\0]\0" => 'UTF-16LE'
};
# Matches a single byte order mark, spelled with octal escapes.
# The four-octet UTF-32LE mark is listed ahead of the two-octet UTF-16LE
# mark because the shorter one is a prefix of the longer one. The pattern
# carries no anchor of its own; the substitutions that use it add ^.
my $BOM_RE = qr/
(?:
\357\273\277 # UTF-8
|
\377\376\0\0 # UTF-32LE
|
\0\0\376\377 # UTF-32BE
|
\376\377 # UTF-16BE
|
\377\376 # UTF-16LE
)
/x;
# <<< copied from Mojo::JSON::XS
# Sample JSON document, plus the same text re-encoded in each UTF flavour.
my $utf8 = q|{"foo":"bar"}|;
my ( $utf16, $utf32, $utf16be, $utf32be, $utf16le, $utf32le ) = map {
    Encode::encode( $_, Encode::decode( 'utf8', $utf8 ) )
} qw( UTF-16 UTF-32 UTF-16BE UTF-32BE UTF-16LE UTF-32LE );
# The endian-less 'UTF-16' / 'UTF-32' encodings come back with a leading
# BOM; drop it so these samples start with payload octets too.
s/^$BOM_RE//go for $utf16, $utf32; # for utf-16, utf-32
# Number of iterations each contender runs in every comparison.
my $count = 300000;
# One table of contenders per sample encoding. The utf16 and utf32 rounds
# feed the little-endian samples to the detectors.
my @rounds = (
    {
        'unpack utf8'    => sub { test_unpack( $utf8 ) },
        'regexp utf8'    => sub { test_regexp( $utf8 ) },
        'mojo_json utf8' => sub { test_mojo_json( $utf8 ) },
        'hybride utf8'   => sub { test_hybride( $utf8 ) },
    },
    {
        'unpack utf16'    => sub { test_unpack( $utf16le ) },
        'regexp utf16'    => sub { test_regexp( $utf16le ) },
        'mojo_json utf16' => sub { test_mojo_json( $utf16le ) },
        'hybride utf16'   => sub { test_hybride( $utf16le ) },
    },
    {
        'unpack utf32'    => sub { test_unpack( $utf32le ) },
        'regexp utf32'    => sub { test_regexp( $utf32le ) },
        'mojo_json utf32' => sub { test_mojo_json( $utf32le ) },
        'hybride utf32'   => sub { test_hybride( $utf32le ) },
    },
);
# Run the rounds in order; cmpthese prints one comparison chart per round.
for my $contenders (@rounds) {
    cmpthese( $count, $contenders );
}
#
#
#
# Detector copied and modified from Mojo::JSON.
# Tries every key of $UTF_PATTERNS as an anchored regex against $text and
# returns the encoding name mapped to the first key that matches; when no
# key matches the text is taken to be 'UTF-8'.
sub test_mojo_json { # copied and modified from Mojo::JSON
    my ( $text ) = @_;
    foreach my $layout ( keys %{$UTF_PATTERNS} ) {
        # The key is interpolated into the pattern on every pass.
        next unless $text =~ /^$layout/;
        return $UTF_PATTERNS->{$layout};
    }
    return 'UTF-8';
}
# Detector copied from JSON::PP: guesses the encoding of an octet string
# from which of its first three octets are NUL.
#
# Takes the octet string to inspect.
# Returns 'UTF-8', 'UTF-16BE', 'UTF-32BE', 'UTF-16LE' or 'UTF-32LE', or
# 'unknown' when the text is undefined or too short to decide.
sub test_unpack { # copied from JSON::PP
    my ( $text ) = @_;
    return 'unknown' unless defined $text;
    # unpack yields only as many values as there are octets available.
    my @octets = unpack('C3', $text);
    # A missing octet is undef, which the tests below would read as NUL;
    # refuse to guess instead of reporting a UTF-32 flavour for '' or '1'.
    return 'unknown' if @octets < 2;
    return ( $octets[0] and  $octets[1]) ? 'UTF-8'
         : (!$octets[0] and  $octets[1]) ? 'UTF-16BE'
         : (!$octets[0] and !$octets[1]) ? 'UTF-32BE'
         # From here on: non-NUL then NUL, i.e. little-endian, and the
         # third octet tells the two widths apart -- if it exists.
         : ( @octets < 3               ) ? 'unknown'
         : ( $octets[2]                ) ? 'UTF-16LE'
         :                                 'UTF-32LE';
}
# Regexp version of test_unpack: guesses the encoding of an octet string
# from which of its first three octets are NUL.
#
# Takes the octet string to inspect.
# Returns 'UTF-8', 'UTF-16BE', 'UTF-32BE', 'UTF-16LE' or 'UTF-32LE', or
# 'unknown' when the text is undefined or too short to decide.
sub test_regexp { # regexp version of test_unpack
    my ( $text ) = @_;
    return 'unknown' unless defined $text;
    # /s matters for the last alternative: without it '.' refuses to match
    # "\n", so little-endian text whose first or third octet is a newline
    # (legal JSON whitespace) used to be reported as 'unknown'.
    return 'unknown' unless ( $text =~ /^(?:([^\0][^\0])|(\0[^\0])|(\0\0)|..(.))/s );
    # Exactly one of the four groups took part in the match.
    return defined $1 ? 'UTF-8'
         : defined $2 ? 'UTF-16BE'
         : defined $3 ? 'UTF-32BE'
         : $4 eq "\0" ? 'UTF-32LE'
         :              'UTF-16LE';
}
# Hybrid detector: a regexp settles the cases decided by the first two
# octets, and unpack inspects the third octet for the little-endian ones.
#
# Takes the octet string to inspect.
# Returns 'UTF-8', 'UTF-16BE', 'UTF-32BE', 'UTF-16LE' or 'UTF-32LE', or
# 'unknown' when the text is undefined or too short to decide.
sub test_hybride { # regexp and unpack
    my ( $text ) = @_;
    return 'unknown' unless defined $text;
    if ( $text =~ /^(?:([^\0][^\0])|(\0[^\0])|(\0\0))/ ) {
        return $1 ? 'UTF-8'
             : $2 ? 'UTF-16BE'
             :      'UTF-32BE';
    }
    # Reaching this point means the text is either shorter than two octets
    # or starts non-NUL, NUL (little-endian).
    my @octets = unpack('C3', $text);
    # Without a third octet there is nothing to tell the widths apart; a
    # missing octet used to be read as NUL and reported as 'UTF-32LE'.
    return 'unknown' if @octets < 3;
    return $octets[2] ? 'UTF-16LE' : 'UTF-32LE';
}
__END__
                   Rate mojo_json utf8 regexp utf8 hybride utf8 unpack utf8
mojo_json utf8  31780/s             --        -94%         -94%        -94%
regexp utf8    526316/s          1556%          --          -4%         -7%
hybride utf8   545455/s          1616%          4%           --         -4%
unpack utf8    566038/s          1681%          8%           4%          --
                    Rate mojo_json utf16 regexp utf16 hybride utf16 unpack utf16
mojo_json utf16 315789/s              --          -7%          -32%         -39%
regexp utf16    340909/s              8%           --          -26%         -34%
hybride utf16   461538/s             46%          35%            --         -11%
unpack utf16    517241/s             64%          52%           12%           --
                    Rate mojo_json utf32 regexp utf32 hybride utf32 unpack utf32
mojo_json utf32  37313/s              --         -89%          -91%         -93%
regexp utf32    329670/s            784%           --          -24%         -34%
hybride utf32   434783/s           1065%          32%            --         -13%
unpack utf32    500000/s           1240%          52%           15%           --
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment