Skip to content

Instantly share code, notes, and snippets.

@Fil
Created June 8, 2014 10:16
Show Gist options
  • Save Fil/61fc715b9aab8133c603 to your computer and use it in GitHub Desktop.
Save Fil/61fc715b9aab8133c603 to your computer and use it in GitHub Desktop.
charsethero : convert mixed utf-8 and iso-latin content to utf-8
#!/usr/bin/perl
# source: http://www.perlmonks.org/?node_id=642617
#use strict;
# Example: a mixed string with ISO 8859-1 and UTF-8 parts:
#my $test_string = "Das Å (auch \"bolle-Å\" genannt, was soviel bedeutet wie \"Kringel-Å\") ist mit der ".
#    force_latin("dänischen Rechtschreibreform von 1948 eingeführt worden.");
#print "Source: $test_string\n";
#print "UTF : ".force_utf8($test_string)."\n";
#print "ISO : ".force_latin($test_string)."\n";
# Filter: copy STDIN to STDOUT, passing every input line through force_utf8().
#
# binmode with no layer argument keeps both handles byte-oriented, so an
# encoding or CRLF layer configured by the environment or platform cannot
# alter the bytes before or after conversion.
binmode STDIN;
binmode STDOUT;

# Read one line at a time. Putting <STDIN> in a foreach list would evaluate it
# in list context and load the whole input into memory before the first line
# is printed.
while ( defined( my $line = <STDIN> ) ) {
    print force_utf8($line);
}
# force_utf8($string)
#
# Returns a copy of $string in which every match of the pattern below has been
# replaced by the return value of encode_char_utf8(). The pattern prefers a
# lead byte followed by one, two or three continuation bytes (\x80-\xbf) and
# otherwise falls back to any single byte in \x80-\xff. Bytes below \x80 are
# never matched and are therefore left as they are. The caller's variable is
# not modified.
sub force_utf8 {
    my ($text) = @_;

    # One capture group: the helper receives the whole matched run as $1.
    my $high_bytes = qr/([\xc0-\xdf][\x80-\xbf]{1}|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3}|[\x80-\xff])/;

    $text =~ s/$high_bytes/encode_char_utf8($1)/ge;
    return $text;
}
# force_latin($string)
#
# Returns a copy of $string in which every match of the pattern below has been
# replaced by the return value of decode_char_utf8(). The pattern prefers a
# lead byte followed by one, two or three continuation bytes (\x80-\xbf) and
# otherwise falls back to any single byte in \x80-\xff. Bytes below \x80 are
# never matched and are therefore left as they are. The caller's variable is
# not modified.
sub force_latin {
    my ($text) = @_;

    # One capture group: the helper receives the whole matched run as $1.
    my $high_bytes = qr/([\xc0-\xdf][\x80-\xbf]{1}|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3}|[\x80-\xff])/;

    $text =~ s/$high_bytes/decode_char_utf8($1)/ge;
    return $text;
}
# encode_char_utf8($char)
#
# Returns $char as UTF-8.
#
#   * If $char is exactly one well-formed multi-byte UTF-8 sequence it is
#     returned unchanged.
#   * Otherwise every non-ASCII character in $char is replaced by its two-byte
#     UTF-8 form, i.e. it is interpreted as an ISO-8859-1 byte. ASCII
#     characters and the empty string come back unchanged.
#
# "Well-formed" follows the UTF-8 byte ranges of RFC 3629: overlong forms
# (lead bytes C0/C1, E0 80-9F.., F0 80-8F..), encoded surrogates (ED A0-BF..)
# and anything above U+10FFFF (F4 90.. and lead bytes F5 and up) are rejected
# here and therefore re-encoded byte by byte, so the result is always valid
# UTF-8 for byte-string input.
#
# The argument is expected to be a byte string; the two-byte formula is only
# meaningful for code points up to 0x7FF.
sub encode_char_utf8 {
    my $char = shift;

    return $char
        if $char =~ m{
            \A
            (?:
                [\xc2-\xdf] [\x80-\xbf]
              | \xe0 [\xa0-\xbf] [\x80-\xbf]
              | [\xe1-\xec\xee\xef] [\x80-\xbf]{2}
              | \xed [\x80-\x9f] [\x80-\xbf]
              | \xf0 [\x90-\xbf] [\x80-\xbf]{2}
              | [\xf1-\xf3] [\x80-\xbf]{3}
              | \xf4 [\x80-\x8f] [\x80-\xbf]{2}
            )
            \z
        }x;

    # Not a valid sequence: encode each non-ASCII character on its own.
    # 110xxxxx 10xxxxxx -- upper bits go into the lead byte, the low six bits
    # into the continuation byte.
    $char =~ s{([^\x00-\x7f])}
              { chr( 0xc0 | ( ord($1) >> 6 ) ) . chr( 0x80 | ( ord($1) & 0x3f ) ) }ge;

    return $char;
}
# decode_char_utf8($char)
#
# Maps one candidate sequence to ISO-8859-1.
#
#   * A well-formed two-byte UTF-8 sequence with lead byte C2 or C3 encodes
#     U+0080..U+00FF and is returned as the corresponding single byte.
#   * Any other well-formed UTF-8 sequence encodes a code point above U+00FF,
#     which ISO-8859-1 cannot represent; the empty string is returned.
#   * Everything else is returned unchanged, i.e. it is taken to be
#     ISO-8859-1 already.
#
# "Well-formed" follows the UTF-8 byte ranges of RFC 3629. In particular the
# overlong lead bytes C0/C1 are NOT decoded: doing so would turn input such
# as "\xc0\xaf" into the ASCII character "/" and "\xc0\x80" into NUL, neither
# of which was present in the input.
sub decode_char_utf8 {
    my $char = shift;

    # U+0080..U+00FF: five payload bits from the lead byte, six from the
    # continuation byte.
    if ( $char =~ m{ \A ([\xc2\xc3]) ([\x80-\xbf]) \z }x ) {
        return chr( ( ( ord($1) & 0x1f ) << 6 ) | ( ord($2) & 0x3f ) );
    }

    # Valid UTF-8 above U+00FF: not representable, drop it.
    return ''
        if $char =~ m{
            \A
            (?:
                [\xc4-\xdf] [\x80-\xbf]
              | \xe0 [\xa0-\xbf] [\x80-\xbf]
              | [\xe1-\xec\xee\xef] [\x80-\xbf]{2}
              | \xed [\x80-\x9f] [\x80-\xbf]
              | \xf0 [\x90-\xbf] [\x80-\xbf]{2}
              | [\xf1-\xf3] [\x80-\xbf]{3}
              | \xf4 [\x80-\x8f] [\x80-\xbf]{2}
            )
            \z
        }x;

    # Not UTF-8 at all: leave the bytes alone.
    return $char;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment