Skip to content

Instantly share code, notes, and snippets.

@pjlsergeant
Created January 28, 2010 03:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pjlsergeant/288415 to your computer and use it in GitHub Desktop.
Save pjlsergeant/288415 to your computer and use it in GitHub Desktop.
package FixEncodings;
use strict;
use warnings;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(explain attempt_decode);
# Table of known corrupted byte sequences and what to put in their place.
# Each entry is a two-element array ref: [ corrupted bytes => replacement ].
#
# Order matters: the entries are joined into a single regex alternation in
# the order given here, and an alternation takes the first alternative that
# matches. A long sequence must therefore be listed before any shorter
# sequence that is a prefix of it ("longest first").
our @known_bad = (

    # Garbage included
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{9d}" => "-" ], # EM DASH
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{93}" => "-" ], # EN DASH
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{a2}\x{e2}\x{80}\x{9e}\x{c2}\x{a2}" => "\x{2019}" ], # RIGHT SINGLE QUOTATION MARK
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{85}\x{e2}\x{80}\x{9c}" => "\x{201c}" ], # LEFT DOUBLE QUOTATION MARK
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{a1}\x{c3}\x{82}\x{c2}\x{ac}" => "\x{20ac}" ], # EURO SIGN
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}\x{c3}\x{82}\x{c2}\x{9d}" => "\x{201d}" ], # RIGHT DOUBLE QUOTATION MARK
    ["\x{c3}\x{83}\x{c2}\x{a2}\x{c3}\x{82}\x{e2}\x{82}\x{ac}\x{c3}\x{82}\x{c2}\x{a6}" => ' '], # Random garbage
    ["\x{c3}\x{83}\x{c2}\x{af}\x{c3}\x{82}\x{c2}\x{bf}\x{c3}\x{82}\x{c2}\x{bd}" => "\x{A3}" ], # POUND SIGN
    ["\x{c3}\x{83}\x{e2}\x{80}\x{9a}\x{c3}\x{82}\x{c2}\x{a3}" => "\x{A3}" ], # POUND SIGN

    # CP-1252 misconversions.
    # See: http://www.caswenson.com/past/2007/11/29/painful_sed_script_of_wordpress_doom/
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{aa}" => "\x{ea}" ], # LATIN SMALL LETTER E WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{b3}" => "\x{f3}" ], # LATIN SMALL LETTER O WITH ACUTE
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{b6}" => "\x{f6}" ], # LATIN SMALL LETTER O WITH DIAERESIS
    ["\x{c3}\x{a2}\x{cb}\x{86}\x{e2}\x{80}\x{9a}" => " " ], # Newline?
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{93}\x{c2}\x{a0}" => " " ], # Newline?
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{a4}" => "\x{A3}" ], # POUND SIGN
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{9a}\x{c2}\x{ac}" => "\x{20ac}" ], # EURO SIGN
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{9e}\x{c2}\x{a2}" => "\x{2122}" ], # TRADE MARK SIGN
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{9d}" => "\x{201d}" ], # RIGHT DOUBLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{a2}" => "\x{2022}" ], # BULLET
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{a6}" => "\x{2026}" ], # HORIZONTAL ELLIPSIS
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{a8}" => ' ' ], # LINE SEPARATOR
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{b9}" => "\x{2039}" ], # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c2}\x{ba}" => "\x{203a}" ], # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{93}" => "\x{201c}" ], # LEFT DOUBLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{a1}" => "\x{201a}" ], # SINGLE LOW-9 QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{c5}\x{be}" => "\x{201e}" ], # DOUBLE LOW-9 QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{cb}\x{9c}" => "\x{2018}" ], # LEFT SINGLE QUOTATION MARK
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{80}\x{98}" => "-" ], # NON-BREAKING HYPHEN
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{80}\x{9c}" => "-" ], # EN DASH
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{80}\x{9d}" => "-" ], # EM DASH
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{80}\x{b0}" => " " ], # THIN SPACE
    ["\x{c3}\x{a2}\x{e2}\x{82}\x{ac}\x{e2}\x{84}\x{a2}" => "\x{2019}" ], # RIGHT SINGLE QUOTATION MARK

    # Unknown encoding
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{a4}" => "\x{e4}" ], # LATIN SMALL LETTER A WITH DIAERESIS
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{a5}" => "\x{e5}" ], # LATIN SMALL LETTER A WITH RING ABOVE
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{a9}" => "\x{e9}" ], # LATIN SMALL LETTER E WITH ACUTE
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c2}\x{bc}" => "\x{fc}" ], # LATIN SMALL LETTER U WITH DIAERESIS
    ["\x{c3}\x{83}\x{c6}\x{92}\x{c3}\x{82}\x{c5}\x{b8}" => "\x{df}" ], # LATIN SMALL LETTER SHARP S

    # Misc - inexplicably broken but common
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{94}\x{c2}\x{8f}" => "\x{2122}" ], # TRADE MARK SIGN
    ["\x{c3}\x{a2}\x{e2}\x{80}\x{b0}\x{c2}\x{a5}" => "" ], # Apparently blank space
    ["\x{c3}\x{af}\x{c2}\x{bf}\x{c2}\x{bd}" => "\x{A3}" ], # Usually a pound sign, sometimes eacute, sometimes an apostrophe

    # Double-encoded three-byte Unicode
    ["\x{c3}\x{82}\x{e2}\x{80}\x{93}" => "-" ], # EN DASH
    ["\x{c3}\x{82}\x{e2}\x{80}\x{94}" => "-" ], # EM DASH
    ["\x{c3}\x{82}\x{e2}\x{80}\x{98}" => "\x{2018}" ], # LEFT SINGLE QUOTATION MARK
    ["\x{c3}\x{82}\x{e2}\x{80}\x{99}" => "\x{2019}" ], # RIGHT SINGLE QUOTATION MARK
    ["\x{c3}\x{82}\x{e2}\x{80}\x{9c}" => "\x{201c}" ], # LEFT DOUBLE QUOTATION MARK
    ["\x{c3}\x{82}\x{e2}\x{80}\x{9d}" => "\x{201d}" ], # RIGHT DOUBLE QUOTATION MARK
    # NOTE(review): e2 80 9e is the UTF-8 form of U+201E (DOUBLE LOW-9
    # QUOTATION MARK) but is replaced with U+201C - confirm this is intended.
    ["\x{c3}\x{82}\x{e2}\x{80}\x{9e}" => "\x{201c}" ], # LEFT DOUBLE QUOTATION MARK
    ["\x{c3}\x{82}\x{e2}\x{80}\x{a6}" => "\x{2026}" ], # HORIZONTAL ELLIPSIS
    ["\x{c3}\x{82}\x{e2}\x{82}\x{ac}" => "\x{20ac}" ], # EURO SIGN
    ["\x{c3}\x{82}\x{e2}\x{84}\x{a2}" => "\x{2122}" ], # TRADE MARK SIGN
    ["\x{c3}\x{83}\x{e2}\x{80}\x{93}" => "\x{d6}" ], # LATIN CAPITAL LETTER O WITH DIAERESIS
    ["\x{c3}\x{83}\x{e2}\x{80}\x{98}" => "\x{d1}" ], # LATIN CAPITAL LETTER N WITH TILDE
    ["\x{c3}\x{83}\x{e2}\x{80}\x{9a}" => "" ], # Consistent perversion that comes before an '
    ["\x{c3}\x{83}\x{e2}\x{80}\x{9c}" => "\x{d3}" ], # LATIN CAPITAL LETTER O WITH ACUTE
    ["\x{c3}\x{83}\x{e2}\x{80}\x{9e}" => "\x{c4}" ], # LATIN CAPITAL LETTER A WITH DIAERESIS
    ["\x{c3}\x{83}\x{e2}\x{80}\x{a6}" => "\x{c5}" ], # LATIN CAPITAL LETTER A WITH RING ABOVE
    ["\x{c3}\x{83}\x{e2}\x{80}\x{b0}" => "\x{c9}" ], # LATIN CAPITAL LETTER E WITH ACUTE
    ["\x{c3}\x{83}\x{e2}\x{82}\x{ac}" => "\x{e0}" ], # LATIN SMALL LETTER A WITH GRAVE
    ["\x{c3}\x{83}\x{e2}\x{84}\x{a2}" => "\x{d9}" ], # LATIN CAPITAL LETTER U WITH GRAVE
    ["\x{c3}\x{85}\x{e2}\x{80}\x{9c}" => "\x{0153}" ], # LATIN SMALL LIGATURE OE

    # Double-encoded Unicode
    ["\x{c3}\x{82}\x{c2}\x{a0}" => " " ], # NO-BREAK SPACE
    ["\x{c3}\x{82}\x{c2}\x{a1}" => "\x{a1}" ], # INVERTED EXCLAMATION MARK
    ["\x{c3}\x{82}\x{c2}\x{a3}" => "\x{a3}" ], # POUND SIGN
    ["\x{c3}\x{82}\x{c2}\x{a4}" => "\x{a4}" ], # CURRENCY SIGN
    ["\x{c3}\x{82}\x{c2}\x{a7}" => "\x{a7}" ], # SECTION SIGN
    # NOTE(review): c2 a9 is the UTF-8 form of U+00A9 (COPYRIGHT SIGN) but is
    # replaced with U+2122 - confirm this is intended.
    ["\x{c3}\x{82}\x{c2}\x{a9}" => "\x{2122}" ], # TRADE MARK SIGN
    ["\x{c3}\x{82}\x{c2}\x{aa}" => "\x{aa}" ], # FEMININE ORDINAL INDICATOR
    ["\x{c3}\x{82}\x{c2}\x{ab}" => "\x{ab}" ], # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    ["\x{c3}\x{82}\x{c2}\x{ac}" => "\x{ac}" ], # NOT SIGN
    ["\x{c3}\x{82}\x{c2}\x{ad}" => "\x{ad}" ], # SOFT HYPHEN
    ["\x{c3}\x{82}\x{c2}\x{ae}" => "\x{ae}" ], # REGISTERED SIGN
    ["\x{c3}\x{82}\x{c2}\x{b0}" => "\x{b0}" ], # DEGREE SIGN
    ["\x{c3}\x{82}\x{c2}\x{b1}" => "\x{b1}" ], # PLUS-MINUS SIGN
    ["\x{c3}\x{82}\x{c2}\x{b2}" => "\x{b2}" ], # SUPERSCRIPT TWO
    ["\x{c3}\x{82}\x{c2}\x{b4}" => "\x{b4}" ], # ACUTE ACCENT
    ["\x{c3}\x{82}\x{c2}\x{b7}" => "\x{b7}" ], # MIDDLE DOT
    ["\x{c3}\x{82}\x{c2}\x{b9}" => "\x{b9}" ], # SUPERSCRIPT ONE
    # NOTE(review): c2 ba is the UTF-8 form of U+00BA (MASCULINE ORDINAL
    # INDICATOR) but is replaced with U+00B0 - confirm this is intended.
    ["\x{c3}\x{82}\x{c2}\x{ba}" => "\x{b0}" ], # DEGREE SIGN
    ["\x{c3}\x{82}\x{c2}\x{bb}" => "\x{bb}" ], # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    ["\x{c3}\x{82}\x{c2}\x{bc}" => "\x{bc}" ], # VULGAR FRACTION ONE QUARTER
    ["\x{c3}\x{82}\x{c2}\x{bd}" => "\x{bd}" ], # VULGAR FRACTION ONE HALF
    ["\x{c3}\x{82}\x{c2}\x{be}" => "\x{be}" ], # VULGAR FRACTION THREE QUARTERS
    ["\x{c3}\x{82}\x{c2}\x{bf}" => "\x{bf}" ], # INVERTED QUESTION MARK
    ["\x{c3}\x{82}\x{c5}\x{93}" => "\x{0153}" ], # LATIN SMALL LIGATURE OE
    ["\x{c3}\x{83}\x{c2}\x{81}" => "\x{c1}" ], # LATIN CAPITAL LETTER A WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{8d}" => "\x{cd}" ], # LATIN CAPITAL LETTER I WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{a0}" => "\x{e0}" ], # LATIN SMALL LETTER A WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{a1}" => "\x{e1}" ], # LATIN SMALL LETTER A WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{a2}" => "\x{e2}" ], # LATIN SMALL LETTER A WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{a3}" => "\x{e3}" ], # LATIN SMALL LETTER A WITH TILDE
    ["\x{c3}\x{83}\x{c2}\x{a4}" => "\x{e4}" ], # LATIN SMALL LETTER A WITH DIAERESIS
    ["\x{c3}\x{83}\x{c2}\x{a5}" => "\x{e5}" ], # LATIN SMALL LETTER A WITH RING ABOVE
    ["\x{c3}\x{83}\x{c2}\x{a6}" => "\x{e6}" ], # LATIN SMALL LETTER AE
    ["\x{c3}\x{83}\x{c2}\x{a7}" => "\x{e7}" ], # LATIN SMALL LETTER C WITH CEDILLA
    ["\x{c3}\x{83}\x{c2}\x{a8}" => "\x{e8}" ], # LATIN SMALL LETTER E WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{a9}" => "\x{e9}" ], # LATIN SMALL LETTER E WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{aa}" => "\x{ea}" ], # LATIN SMALL LETTER E WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{ab}" => "\x{eb}" ], # LATIN SMALL LETTER E WITH DIAERESIS
    ["\x{c3}\x{83}\x{c2}\x{ac}" => "\x{ec}" ], # LATIN SMALL LETTER I WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{ad}" => "\x{ed}" ], # LATIN SMALL LETTER I WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{ae}" => "\x{ee}" ], # LATIN SMALL LETTER I WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{af}" => "\x{ef}" ], # LATIN SMALL LETTER I WITH DIAERESIS
    ["\x{c3}\x{83}\x{c2}\x{b1}" => "\x{f1}" ], # LATIN SMALL LETTER N WITH TILDE
    ["\x{c3}\x{83}\x{c2}\x{b2}" => "\x{f2}" ], # LATIN SMALL LETTER O WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{b3}" => "\x{f3}" ], # LATIN SMALL LETTER O WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{b4}" => "\x{f4}" ], # LATIN SMALL LETTER O WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{b5}" => "\x{f5}" ], # LATIN SMALL LETTER O WITH TILDE
    ["\x{c3}\x{83}\x{c2}\x{b6}" => "\x{f6}" ], # LATIN SMALL LETTER O WITH DIAERESIS
    ["\x{c3}\x{83}\x{c2}\x{b8}" => "\x{f8}" ], # LATIN SMALL LETTER O WITH STROKE
    ["\x{c3}\x{83}\x{c2}\x{b9}" => "\x{f9}" ], # LATIN SMALL LETTER U WITH GRAVE
    ["\x{c3}\x{83}\x{c2}\x{ba}" => "\x{fa}" ], # LATIN SMALL LETTER U WITH ACUTE
    ["\x{c3}\x{83}\x{c2}\x{bb}" => "\x{fb}" ], # LATIN SMALL LETTER U WITH CIRCUMFLEX
    ["\x{c3}\x{83}\x{c2}\x{bc}" => "\x{fc}" ], # LATIN SMALL LETTER U WITH DIAERESIS
    ["\x{c3}\x{83}\x{c5}\x{93}" => "\x{dc}" ], # LATIN CAPITAL LETTER U WITH DIAERESIS
    ["\x{c3}\x{83}\x{c5}\x{a1}" => "\x{da}" ], # LATIN CAPITAL LETTER U WITH ACUTE
    ["\x{c3}\x{83}\x{c5}\x{b8}" => "\x{DF}" ], # LATIN SMALL LETTER SHARP S
    ["\x{c3}\x{83}\x{c6}\x{92}" => "\x{e1}" ], # LATIN SMALL LETTER A WITH ACUTE
    ["\x{c3}\x{83}\x{cb}\x{86}" => "\x{c8}" ], # LATIN CAPITAL LETTER E WITH GRAVE
    ["\x{c3}\x{83}\x{e2}\x{80}\x{a1}" => "\x{c7}" ], # LATIN CAPITAL LETTER C WITH CEDILLA
    ["\x{c3}\x{85}\x{c2}\x{a1}" => "\x{161}" ], # LATIN SMALL LETTER S WITH CARON
);
# Lookup from corrupted byte sequence to its replacement. Each replacement
# is UTF-8 encoded on a copy (the entries in @known_bad are left untouched)
# so that the substituted text is bytes, ready for the utf8::decode() that
# attempt_decode() applies afterwards.
my %known_corruptions = map {
    my ( $bad, $good ) = @{$_};
    utf8::encode($good);
    ( $bad => $good );
} @known_bad;

# We want these to appear in the order we've specified them above, because
# a regex alternation takes the first alternative that matches. quotemeta
# keeps every entry a literal byte sequence even if a future entry happens
# to contain a regex metacharacter.
my $known_corruptions = join '|', map { quotemeta $_->[0] } @known_bad;
=head2 attempt_decode

Returns a copy of the input string, with substitutions applied. This doesn't
work with char-strings.

Every known corrupted byte sequence is replaced first, then whatever remains
is decoded as UTF-8. Returns nothing (undef in scalar context) when the
input is undefined.

=cut

# Set to a true value to trace each stage of the repair through warn().
my $debug_decode = 0;

# A corrupted sequence is left alone when the byte immediately before it is
# this one (enforced by the negative lookbehind below).
# NOTE(review): presumably because a preceding c3 means the match is the
# tail of a longer encoded sequence - confirm.
my $avoid_control = "\x{c3}";

# Compiled once at load time; both interpolated variables are fixed by then.
my $corruption_re = qr/(?<!$avoid_control)($known_corruptions)/;

sub attempt_decode {
    my $string = shift;

    # Bail out before the debug trace so explain() is never handed undef.
    return unless defined $string;
    warn "Received: " . explain($string) if $debug_decode;

    # Remove corruptions
    $string =~ s/$corruption_re/$known_corruptions{$1}/ge;
    warn "Subst'ed: " . explain($string) if $debug_decode;

    # Decode remaining UTF8. The return value is deliberately ignored: the
    # string is returned whether or not the decode succeeds.
    utf8::decode($string);
    warn "Decoded : " . explain($string) if $debug_decode;

    return $string;
}
=head2 explain

Returns the input string with high characters encoded

Every character whose code point is above 126 is written as C<\x{HEX}>
(lower-case hex); all other characters are passed through unchanged. An
empty or undefined input gives the empty string.

=cut

sub explain {
    my $string = shift;

    # Nothing to describe; also avoids an uninitialized-value warning.
    return '' unless defined $string;

    # join over an empty list yields '', so the empty string maps to ''
    # rather than undef.
    return join '', map {
        my $ord = ord;
        $ord > 126 ? sprintf( '\x{%x}', $ord ) : $_;
    } split //, $string;
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment