Skip to content

Instantly share code, notes, and snippets.

@niczero
Last active November 22, 2016 17:11
Show Gist options
  • Save niczero/0c95bacd9e85203f7de2c4f41b6c63df to your computer and use it in GitHub Desktop.
Save niczero/0c95bacd9e85203f7de2c4f41b6c63df to your computer and use it in GitHub Desktop.
Translating utf8 byte strings
# Steal bits, but don't take at face value
# Originally stolen from https://gist.github.com/nicwolff/1484073
# Normalise a byte string that is mostly utf8 but may contain stray cp1252
# bytes, so that the result is utf8-encoded bytes throughout.
#
# Arguments:
#   $self - the invocant; not used, kept so existing method calls still work
#   $in   - the byte string to clean up
#
# Returns the cleaned byte string ('' for undefined or empty input).
#
# A lead byte followed by the right number of continuation bytes (10xxxxxx) is
# copied through untouched, for 2-, 3- and 4-byte sequences.  Any other
# non-ascii byte is treated as a single cp1252 character and re-encoded.  Only
# the bit patterns are checked, so overlong or out-of-range sequences are
# passed through as they are.
sub _decode {
  my ($self, $in) = @_;
  return '' unless defined $in;

  my $out = '';
  use bytes; # Who said C was dead?
  my ($i, $length) = (0, length $in);
  while ($i < $length) {
    my $byte = substr $in, $i, 1;
    my $lead = ord $byte;

    if ($lead < 0x80) {
      # 7-bit ascii
      $out .= $byte;
      ++$i;
      next;
    }

    # How many continuation bytes the lead byte promises (0 = not a lead byte)
    my $tail_len = $lead >> 5 == 0x06 ? 1    # 110xxxxx
                 : $lead >> 4 == 0x0E ? 2    # 1110xxxx
                 : $lead >> 3 == 0x1E ? 3    # 11110xxx
                 :                      0;

    # The whole sequence must fit inside the string; a sequence that ends
    # exactly on the final byte is fine.
    my $valid = $tail_len > 0 && $i + $tail_len < $length;
    for my $offset (1 .. $tail_len) {
      last unless $valid;
      $valid = ord(substr $in, $i + $offset, 1) >> 6 == 2;
    }

    if ($valid) {
      # Well-formed multi-byte char: keep its bytes exactly as they are
      $out .= substr $in, $i, 1 + $tail_len;
      $i += 1 + $tail_len;
    }
    else {
      # Stray high byte: read it as cp1252.  Because 'use bytes' is in effect,
      # the concatenation appends the decoded character's internal (utf8)
      # bytes rather than upgrading $out to a character string.
      $out .= decode 'cp1252', $byte;
      ++$i;
    }
  }
  return $out;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment