Created
January 28, 2013 22:58
-
-
Save Rhomboid/4660094 to your computer and use it in GitHub Desktop.
Check for invalid UTF-8 in Metafilter infodump
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# Scan a tab-separated dump and report every line whose third field (the
# username) is not well-formed UTF-8.
#
# Usage: perl this_script.pl [FILE]
#   FILE defaults to 'usernames.txt' in the current directory.
#
# One message per offending line is printed to STDOUT, with every byte
# outside printable ASCII (0x20-0x7e) rendered as a \xNN escape so the raw
# bytes can be inspected on any terminal.
use warnings;
use strict;
use feature qw/say/;
use Encode qw/decode/;
use Try::Tiny;

my $path = shift(@ARGV) // 'usernames.txt';

# Read raw bytes: any default decoding layer (e.g. from -C / PERL_UNICODE)
# would transform the input before we get a chance to validate it.
open my $usernames, '<:raw', $path or die "Cannot open $path: $!";

# A named loop variable is used instead of $_ because Try::Tiny rebinds $_
# to the exception inside catch blocks.
while (my $line = <$usernames>) {
    # Strip the line terminator, tolerating CRLF files.
    $line =~ s/\r?\n$//;

    # Lines with fewer than three fields have nothing to validate.
    my $username = (split /\t/, $line)[2];
    next unless defined $username;

    try {
        # FB_CROAK makes decode() die on malformed input; LEAVE_SRC stops it
        # from consuming $username in place, which the catch block still
        # needs for its report. The decoded result itself is not used.
        decode('UTF-8', $username, Encode::FB_CROAK() | Encode::LEAVE_SRC());
    }
    catch {
        (my $quoted_username = $username)
            =~ s/([^\x20-\x7e])/sprintf "\\x%02x", ord $1/eg;
        # $. still refers to $usernames, the last handle read from.
        say "Invalid UTF-8 found in username on line $.: $quoted_username";
    };    # try/catch is an ordinary function call, so terminate the statement
}

close $usernames or die "Cannot close $path: $!";
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Invalid UTF-8 found in username on line 4864: \xa9 | |
Invalid UTF-8 found in username on line 5408: pi\xf9 | |
Invalid UTF-8 found in username on line 5497: uttar\xf2 | |
Invalid UTF-8 found in username on line 6070: lk\xf2jhk\xf2j | |
Invalid UTF-8 found in username on line 6741: \xa1\xf0\xb6\xcc\xbd\xde--\xb7Q\xbb\xa1 | |
Invalid UTF-8 found in username on line 9569: \xe1\xcc\xc5\xcb\xd3\xc5\xca\xf3\xd4\xc1\xd3\xc0\xcb\xc5\xd7\xc9\xde | |
Invalid UTF-8 found in username on line 9576: \xf4\xd2\xd5\xc8\xc9\xce\xe5\xd7\xc7\xc5\xce\xc9\xca\xef\xcc\xc5\xc7\xcf\xd7\xc9\xde | |
Invalid UTF-8 found in username on line 9578: \xfb\xd5\xd0\xcc\xd1\xcb\xe1\xce\xc1\xd4\xcf\xcc\xc9\xca\xe1\xce\xc1\xd4\xcf\xcc\xd8\xc5\xd7\xc9\xde | |
Invalid UTF-8 found in username on line 9580: \xf3\xd4\xc1\xce\xc9\xd3\xcc\xc1\xd7\xf1\xce\xcb\xcf\xd7\xd3\xcb\xc9\xca | |
Invalid UTF-8 found in username on line 9582: \xf3\xcf\xc2\xcf\xcc\xc5\xd7\xe1\xcc\xc5\xcb\xd3\xc1\xce\xc4\xd2\xf3\xc5\xd2\xc7\xc5\xc5\xd7\xc9\xde | |
Invalid UTF-8 found in username on line 9592: \xe1\xd2\xcd\xc5\xce\xed\xce\xc1\xc3\xc1\xcb\xc1\xce\xd1\xce | |
Invalid UTF-8 found in username on line 9635: \xed\xc1\xd2\xd8\xd1\xce\xe2\xc5\xcc\xc5\xce\xd8\xcb\xc9\xca | |
Invalid UTF-8 found in username on line 9929: \xf0\xcf\xc2\xcf\xcc\xc5\xcc\xcf\xd7\xe1\xcc\xc5\xcb\xd3\xc1\xce\xc4\xd2\xe9\xd7\xc1\xce\xcf\xd7\xc9\xde | |
Invalid UTF-8 found in username on line 10622: \xf7\xcf\xcc\xd8\xc4\xc5\xcd\xc1\xd2\xeb\xc9\xd3\xd3 | |
Invalid UTF-8 found in username on line 10714: \xeb\xc9\xd3\xf3.\xe1. | |
Invalid UTF-8 found in username on line 11066: \xe5\xd7\xc7\xc5\xce\xc9\xca | |
Invalid UTF-8 found in username on line 11513: \xf3\xcf\xc2\xd1\xce\xc9\xce | |
Invalid UTF-8 found in username on line 11627: \xf2\xcf\xda\xd5\xcd\xce\xd9\xca\xe1\xcc\xc5\xcb\xd3\xc1\xce\xc4\xd2\xf7\xcc\xc1\xc4\xc9\xcd\xc9\xd2\xcf\xd7\xc9\xde | |
Invalid UTF-8 found in username on line 11875: \xe5.\xe0.\xee\xc5\xd7\xc9\xc4\xc9\xcd\xcf\xd7 | |
Invalid UTF-8 found in username on line 12010: l\xf6wi | |
Invalid UTF-8 found in username on line 12288: \xae | |
Invalid UTF-8 found in username on line 12293: \xb7 | |
Invalid UTF-8 found in username on line 12295: \xb91 | |
Invalid UTF-8 found in username on line 12298: ( \xb7 )( \xb7 ) | |
Invalid UTF-8 found in username on line 12367: \xd9 | |
Invalid UTF-8 found in username on line 12395: Holden\xb7 | |
Invalid UTF-8 found in username on line 12485: mick\x81 | |
Invalid UTF-8 found in username on line 15985: Oy\xe9ah | |
Invalid UTF-8 found in username on line 18340: \xba\xa4\xf8,\xb8\xb8,\xf8\xa4\xba\xba\xa4\xf8,\xb8\xb8,\xf8\xa4\xba | |
Invalid UTF-8 found in username on line 18548: Mumle G\xe5segg | |
Invalid UTF-8 found in username on line 19218: Axaxaxas Ml\xf6 | |
Invalid UTF-8 found in username on line 21287: heavy metal \xfcmlaut | |
Invalid UTF-8 found in username on line 21950: skj\xf8nn | |
Invalid UTF-8 found in username on line 22324: ND\xa2 | |
Invalid UTF-8 found in username on line 23910: La M\xf4me Piaf | |
Invalid UTF-8 found in username on line 24535: umb\xfa | |
Invalid UTF-8 found in username on line 25649: \xae@ | |
Invalid UTF-8 found in username on line 25944: Se\xf1or Pantalones | |
Invalid UTF-8 found in username on line 26212: the black rabbit of Inl\xe9 | |
Invalid UTF-8 found in username on line 26325: \xd8 | |
Invalid UTF-8 found in username on line 27614: Nathanial H\xf6rnblow\xe9r | |
Invalid UTF-8 found in username on line 27950: n\xedmwunnan | |
Invalid UTF-8 found in username on line 29549: Se\xf1or Grumpus | |
Invalid UTF-8 found in username on line 30334: Qo\xf6lio | |
Invalid UTF-8 found in username on line 31654: ausb\xfcrgern | |
Invalid UTF-8 found in username on line 31780: L'homme arm\xe9 | |
Invalid UTF-8 found in username on line 32085: Z\xe9 Pequeno | |
Invalid UTF-8 found in username on line 32274: \xaebin | |
Invalid UTF-8 found in username on line 32385: nicolas l\xe9onard sadi carnot | |
Invalid UTF-8 found in username on line 33600: mathowie\xa0 | |
Invalid UTF-8 found in username on line 56820: d\xe9soeuvr\xe9e |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment