Created
January 28, 2010 03:24
-
-
Save pjlsergeant/288415 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package FixEncodings;

use strict;
use warnings;

require Exporter;

# Nothing is exported by default; callers import explain() and
# attempt_decode() by name.
our @ISA       = qw(Exporter);
our @EXPORT_OK = qw(explain attempt_decode);
# Table of known corruptions. Each entry is a pair:
#
#     [ corrupted byte sequence => replacement ]
#
# The left-hand side is a byte string as it appears in the damaged input. The
# right-hand side is the intended replacement, written as a character string.
#
# This should be in terms of longest first: where one sequence is a prefix of
# another, the longer one has to be tried first or it can never match.
our @known_bad = (

    # Garbage included
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{9d}" => "-" ], # EM DASH
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{93}" => "-" ], # EN DASH
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{a2}\x{e2}\x{80}\x{9e}\x{c2}\x{a2}" => "\x{2019}" ], # RIGHT SINGLE QUOTATION MARK
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{85}\x{e2}\x{80}\x{9c}" => "\x{201c}" ], # LEFT DOUBLE QUOTATION MARK
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{a1}\x{c3}\x{82}\x{c2}\x{ac}" => "\x{20ac}" ], # EURO SIGN
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{82}\x{c2}\x{9d}" => "\x{201d}" ], # RIGHT DOUBLE QUOTATION MARK
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{82}\x{e2}\x{82}\x{ac}\x{c3}\x{82}\x{c2}\x{a6}" => ' '], # Random garbage
    ["\x{c3}\x{83}\x{c2}\x{af}\x{c3}\x{82}\x{c2}\x{bf}\x{c3}\x{82}\x{c2}\x{bd}" => "\x{A3}" ], # POUND SIGN
    ["\x{c3}\x{83}\x{e2}\x{80}\x{9a}\x{c3}\x{82}\x{c2}\x{a3}" => "\x{A3}" ], # POUND SIGN

    # CP-1252 misconversions.
    # See: http://www.caswenson.com/past/2007/11/29/painful_sed_script_of_wordpress_doom/
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{aa}" => "\x{ea}" ], # LATIN SMALL LETTER E WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{b3}" => "\x{f3}" ], # LATIN SMALL LETTER O WITH ACUTE
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{b6}" => "\x{f6}" ], # LATIN SMALL LETTER O WITH DIAERESIS
    ["\x{c3}\x{a2}\x{cb}\x{86}\x{e2}\x{80}\x{9a}" => " " ], # Newline?
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{93}\x{c2}\x{a0}" => " " ], # Newline?
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{a4}" => "\x{A3}" ], # POUND SIGN
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}" => "\x{20ac}" ], # EURO SIGN
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{9e}\x{c2}\x{a2}" => "\x{2122}" ], # TRADE MARK SIGN
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{9d}" => "\x{201d}" ], # RIGHT DOUBLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{a2}" => "\x{2022}" ], # BULLET
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{a6}" => "\x{2026}" ], # HORIZONTAL ELLIPSIS
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{a8}" => ' ' ], # LINE SEPARATOR
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{b9}" => "\x{2039}" ], # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{ba}" => "\x{203a}" ], # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{93}" => "\x{201c}" ], # LEFT DOUBLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{a1}" => "\x{201a}" ], # SINGLE LOW-9 QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{be}" => "\x{201e}" ], # DOUBLE LOW-9 QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{cb}\x{9c}" => "\x{2018}" ], # LEFT SINGLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{80}\x{98}" => "-" ], # NON-BREAKING HYPHEN
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{80}\x{9c}" => "-" ], # EN DASH
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{80}\x{9d}" => "-" ], # EM DASH
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{80}\x{b0}" => " " ], # THIN SPACE
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{84}\x{a2}" => "\x{2019}" ], # RIGHT SINGLE QUOTATION MARK

    # Unknown encoding
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{a4}" => "\x{e4}" ], # LATIN SMALL LETTER A WITH DIAERESIS
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{a5}" => "\x{e5}" ], # LATIN SMALL LETTER A WITH RING ABOVE
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{a9}" => "\x{e9}" ], # LATIN SMALL LETTER E WITH ACUTE
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{bc}" => "\x{fc}" ], # LATIN SMALL LETTER U WITH DIAERESIS
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c5}\x{b8}" => "\x{df}" ], # LATIN SMALL LETTER SHARP S

    # Misc - inexplicably broken but common
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{94}\x{c2}\x{8f}" => "\x{2122}" ], # TRADE MARK SIGN
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{b0}\x{c2}\x{a5}" => "" ], # Apparently blank space
    ["\x{c3}\x{af}\x{c2}\x{bf}\x{c2}\x{bd}" => "\x{A3}" ], # Usually a pound sign, sometimes eacute, sometimes an apostrophe

    # Double-encoded three-byte Unicode
    ["\x{c3}\x{82}\x{e2}\x{80}\x{93}" => "-" ], # EN DASH
    ["\x{c3}\x{82}\x{e2}\x{80}\x{94}" => "-" ], # EM DASH
    ["\x{c3}\x{82}\x{e2}\x{80}\x{98}" => "\x{2018}" ], # LEFT SINGLE QUOTATION MARK
    ["\x{c3}\x{82}\x{e2}\x{80}\x{99}" => "\x{2019}" ], # RIGHT SINGLE QUOTATION MARK
    ["\x{c3}\x{82}\x{e2}\x{80}\x{9c}" => "\x{201c}" ], # LEFT DOUBLE QUOTATION MARK
    ["\x{c3}\x{82}\x{e2}\x{80}\x{9d}" => "\x{201d}" ], # RIGHT DOUBLE QUOTATION MARK
    ["\x{c3}\x{82}\x{e2}\x{80}\x{9e}" => "\x{201c}" ], # LEFT DOUBLE QUOTATION MARK -- NOTE(review): source ends in U+201E (low-9 quote); confirm U+201C is intended
    ["\x{c3}\x{82}\x{e2}\x{80}\x{a6}" => "\x{2026}" ], # HORIZONTAL ELLIPSIS
    ["\x{c3}\x{82}\x{e2}\x{82}\x{ac}" => "\x{20ac}" ], # EURO SIGN
    ["\x{c3}\x{82}\x{e2}\x{84}\x{a2}" => "\x{2122}" ], # TRADE MARK SIGN
    ["\x{c3}\x{83}\x{e2}\x{80}\x{93}" => "\x{d6}" ], # LATIN CAPITAL LETTER O WITH DIAERESIS
    ["\x{c3}\x{83}\x{e2}\x{80}\x{98}" => "\x{d1}" ], # LATIN CAPITAL LETTER N WITH TILDE
    ["\x{c3}\x{83}\x{e2}\x{80}\x{9a}" => "" ], # Consistent perversion that comes before an '
    ["\x{c3}\x{83}\x{e2}\x{80}\x{9c}" => "\x{d3}" ], # LATIN CAPITAL LETTER O WITH ACUTE
    ["\x{c3}\x{83}\x{e2}\x{80}\x{9e}" => "\x{c4}" ], # LATIN CAPITAL LETTER A WITH DIAERESIS
    ["\x{c3}\x{83}\x{e2}\x{80}\x{a6}" => "\x{c5}" ], # LATIN CAPITAL LETTER A WITH RING ABOVE
    ["\x{c3}\x{83}\x{e2}\x{80}\x{b0}" => "\x{c9}" ], # LATIN CAPITAL LETTER E WITH ACUTE
    ["\x{c3}\x{83}\x{e2}\x{82}\x{ac}" => "\x{e0}" ], # LATIN SMALL LETTER A WITH GRAVE
    ["\x{c3}\x{83}\x{e2}\x{84}\x{a2}" => "\x{d9}" ], # LATIN CAPITAL LETTER U WITH GRAVE
    ["\x{c3}\x{85}\x{e2}\x{80}\x{9c}" => "\x{0153}" ], # LATIN SMALL LIGATURE OE

    # Double-encoded Unicode
    ["\x{c3}\x{82}\x{c2}\x{a0}" => " " ], # NO-BREAK SPACE
    ["\x{c3}\x{82}\x{c2}\x{a1}" => "\x{a1}" ], # INVERTED EXCLAMATION MARK
    ["\x{c3}\x{82}\x{c2}\x{a3}" => "\x{a3}" ], # POUND SIGN
    ["\x{c3}\x{82}\x{c2}\x{a4}" => "\x{a4}" ], # CURRENCY SIGN
    ["\x{c3}\x{82}\x{c2}\x{a7}" => "\x{a7}" ], # SECTION SIGN
    ["\x{c3}\x{82}\x{c2}\x{a9}" => "\x{2122}" ], # TRADE MARK SIGN -- NOTE(review): \xc2\xa9 is UTF-8 for U+00A9 (copyright); confirm U+2122 is intended
    ["\x{c3}\x{82}\x{c2}\x{aa}" => "\x{aa}" ], # FEMININE ORDINAL INDICATOR
    ["\x{c3}\x{82}\x{c2}\x{ab}" => "\x{ab}" ], # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    ["\x{c3}\x{82}\x{c2}\x{ac}" => "\x{ac}" ], # NOT SIGN
    ["\x{c3}\x{82}\x{c2}\x{ad}" => "\x{ad}" ], # SOFT HYPHEN
    ["\x{c3}\x{82}\x{c2}\x{ae}" => "\x{ae}" ], # REGISTERED SIGN
    ["\x{c3}\x{82}\x{c2}\x{b0}" => "\x{b0}" ], # DEGREE SIGN
    ["\x{c3}\x{82}\x{c2}\x{b1}" => "\x{b1}" ], # PLUS-MINUS SIGN
    ["\x{c3}\x{82}\x{c2}\x{b2}" => "\x{b2}" ], # SUPERSCRIPT TWO
    ["\x{c3}\x{82}\x{c2}\x{b4}" => "\x{b4}" ], # ACUTE ACCENT
    ["\x{c3}\x{82}\x{c2}\x{b7}" => "\x{b7}" ], # MIDDLE DOT
    ["\x{c3}\x{82}\x{c2}\x{b9}" => "\x{b9}" ], # SUPERSCRIPT ONE
    ["\x{c3}\x{82}\x{c2}\x{ba}" => "\x{b0}" ], # DEGREE SIGN -- NOTE(review): \xc2\xba is UTF-8 for U+00BA (masculine ordinal); confirm U+00B0 is intended
    ["\x{c3}\x{82}\x{c2}\x{bb}" => "\x{bb}" ], # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    ["\x{c3}\x{82}\x{c2}\x{bc}" => "\x{bc}" ], # VULGAR FRACTION ONE QUARTER
    ["\x{c3}\x{82}\x{c2}\x{bd}" => "\x{bd}" ], # VULGAR FRACTION ONE HALF
    ["\x{c3}\x{82}\x{c2}\x{be}" => "\x{be}" ], # VULGAR FRACTION THREE QUARTERS
    ["\x{c3}\x{82}\x{c2}\x{bf}" => "\x{bf}" ], # INVERTED QUESTION MARK
    ["\x{c3}\x{82}\x{c5}\x{93}" => "\x{0153}" ], # LATIN SMALL LIGATURE OE
    ["\x{c3}\x{83}\x{c2}\x{81}" => "\x{c1}" ], # LATIN CAPITAL LETTER A WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{8d}" => "\x{cd}" ], # LATIN CAPITAL LETTER I WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{a0}" => "\x{e0}" ], # LATIN SMALL LETTER A WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{a1}" => "\x{e1}" ], # LATIN SMALL LETTER A WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{a2}" => "\x{e2}" ], # LATIN SMALL LETTER A WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{a3}" => "\x{e3}" ], # LATIN SMALL LETTER A WITH TILDE
    ["\x{c3}\x{83}\x{c2}\x{a4}" => "\x{e4}" ], # LATIN SMALL LETTER A WITH DIAERESIS
    ["\x{c3}\x{83}\x{c2}\x{a5}" => "\x{e5}" ], # LATIN SMALL LETTER A WITH RING ABOVE
    ["\x{c3}\x{83}\x{c2}\x{a6}" => "\x{e6}" ], # LATIN SMALL LETTER AE
    ["\x{c3}\x{83}\x{c2}\x{a7}" => "\x{e7}" ], # LATIN SMALL LETTER C WITH CEDILLA
    ["\x{c3}\x{83}\x{c2}\x{a8}" => "\x{e8}" ], # LATIN SMALL LETTER E WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{a9}" => "\x{e9}" ], # LATIN SMALL LETTER E WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{aa}" => "\x{ea}" ], # LATIN SMALL LETTER E WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{ab}" => "\x{eb}" ], # LATIN SMALL LETTER E WITH DIAERESIS
    ["\x{c3}\x{83}\x{c2}\x{ac}" => "\x{ec}" ], # LATIN SMALL LETTER I WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{ad}" => "\x{ed}" ], # LATIN SMALL LETTER I WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{ae}" => "\x{ee}" ], # LATIN SMALL LETTER I WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{af}" => "\x{ef}" ], # LATIN SMALL LETTER I WITH DIAERESIS
    ["\x{c3}\x{83}\x{c2}\x{b1}" => "\x{f1}" ], # LATIN SMALL LETTER N WITH TILDE
    ["\x{c3}\x{83}\x{c2}\x{b2}" => "\x{f2}" ], # LATIN SMALL LETTER O WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{b3}" => "\x{f3}" ], # LATIN SMALL LETTER O WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{b4}" => "\x{f4}" ], # LATIN SMALL LETTER O WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{b5}" => "\x{f5}" ], # LATIN SMALL LETTER O WITH TILDE
    ["\x{c3}\x{83}\x{c2}\x{b6}" => "\x{f6}" ], # LATIN SMALL LETTER O WITH DIAERESIS
    ["\x{c3}\x{83}\x{c2}\x{b8}" => "\x{f8}" ], # LATIN SMALL LETTER O WITH STROKE
    ["\x{c3}\x{83}\x{c2}\x{b9}" => "\x{f9}" ], # LATIN SMALL LETTER U WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{ba}" => "\x{fa}" ], # LATIN SMALL LETTER U WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{bb}" => "\x{fb}" ], # LATIN SMALL LETTER U WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{bc}" => "\x{fc}" ], # LATIN SMALL LETTER U WITH DIAERESIS
    ["\x{c3}\x{83}\x{c5}\x{93}" => "\x{dc}" ], # LATIN CAPITAL LETTER U WITH DIAERESIS
    ["\x{c3}\x{83}\x{c5}\x{a1}" => "\x{da}" ], # LATIN CAPITAL LETTER U WITH ACUTE
    ["\x{c3}\x{83}\x{c5}\x{b8}" => "\x{DF}" ], # LATIN SMALL LETTER SHARP S
    ["\x{c3}\x{83}\x{c6}\x{92}" => "\x{e1}" ], # LATIN SMALL LETTER A WITH ACUTE
    ["\x{c3}\x{83}\x{cb}\x{86}" => "\x{c8}" ], # LATIN CAPITAL LETTER E WITH GRAVE
    ["\x{c3}\x{83}\x{e2}\x{80}\x{a1}" => "\x{c7}" ], # LATIN CAPITAL LETTER C WITH CEDILLA
    ["\x{c3}\x{85}\x{c2}\x{a1}" => "\x{161}" ], # LATIN SMALL LETTER S WITH CARON
);
# Lookup from corrupted byte sequence to its replacement. The replacements are
# written above as character strings; encode each one to UTF-8 bytes so it can
# be spliced into a byte string that is decoded as a whole afterwards.
# (utf8::encode works on the copy in $fixed, so @known_bad is left untouched.)
my %known_corruptions = map {
    my ( $corrupt, $fixed ) = @{$_};
    utf8::encode($fixed);
    ( $corrupt => $fixed );
} @known_bad;

# Regex alternation of every corrupted sequence. Alternatives are tried left to
# right, so a sequence that is a prefix of a longer one must come after it;
# sorting longest-first guarantees that however the table above is ordered.
# Taking the keys of the hash also drops any duplicate table entries, and
# quotemeta keeps every byte literal inside the pattern.
my $known_corruptions = join '|',
    map  { quotemeta($_) }
    sort { length($b) <=> length($a) or $a cmp $b }
    keys %known_corruptions;
=head2 attempt_decode

    my $fixed = attempt_decode($bytes);

Returns a copy of the input string, with substitutions applied: every known
corrupted byte sequence is replaced by the UTF-8 bytes of its intended
character, and the result is then decoded as UTF-8. If the input is undefined,
nothing is returned (C<undef> in scalar context).

This doesn't work with char-strings; pass in bytes.

=cut

# Set to a true value to trace each stage of the decode via warn().
my $debug_decode = 0;

# A corrupted sequence is left alone when this byte comes directly before it.
# NOTE(review): presumably this stops a match from starting in the middle of a
# longer \xc3-led sequence -- confirm against real data.
my $avoid_control = "\x{c3}";

sub attempt_decode {
    my $string = shift;

    # Check for undef before tracing, so the debug output never has to
    # stringify an undefined value.
    return unless defined $string;
    warn "Received: " . explain($string) if $debug_decode;

    # Remove corruptions
    $string =~ s/(?<!$avoid_control)($known_corruptions)/$known_corruptions{$1}/ge;
    warn "Subst'ed: " . explain($string) if $debug_decode;

    # Decode remaining UTF8. The return value is deliberately ignored: if the
    # bytes are not valid UTF-8 the string is handed back as it stands.
    utf8::decode($string);
    warn "Decoded : " . explain($string) if $debug_decode;

    return $string;
}
=head2 explain

    my $printable = explain($string);

Returns the input string with high characters encoded: every character whose
code point is above 126 is replaced by a literal C<\x{...}> escape holding the
code point in lower-case hex. All other characters are passed through
unchanged. An empty or undefined input gives an empty string.

=cut

sub explain {
    my $string = shift;

    # Nothing to describe; return '' rather than undef so callers can
    # concatenate the result without warnings.
    return '' unless defined $string;

    return join '', map {
        my $ord = ord;
        $ord > 126 ? sprintf( '\x{%x}', $ord ) : $_;
    } split //, $string;
}

1;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment