Last active
December 22, 2015 10:58
-
-
Save jjl/6462223 to your computer and use it in GitHub Desktop.
Useful things for cleaning up csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use Text::CSV::Encoded; | |
use Encode qw(encode decode); | |
use autodie; | |
use feature ':5.16'; | |
use strict; | |
use warnings; | |
sub go { | |
my $lineno = 1; # Sadly, using Text::CSV to read the lines (necessary for | |
# correctly handling embedded newlines) breaks $. | |
my $csv = Text::CSV::Encoded->new({ | |
binary => 1, # Allows for embedded newlines | |
encoding_in => "utf-8", # Because we like non-ASCII bits | |
encoding_out => undef, # If you set this to utf-8, it gives you back a bytestring | |
}); | |
my @errors; | |
my @lines; | |
while (my $row = $csv->getline(*ARGV)) { | |
my @cols = @$row; | |
my $tmp = join ",", @cols; | |
my $count = $tmp =~ tr/\n//; # Embedded newlines ruin the line count | |
my @ret = map {cleanse($_)} @cols; | |
$csv->combine(@ret); | |
my $transformed = $csv->string; | |
my @illegals = illegal($transformed); | |
if (@illegals) { | |
@lines=(); | |
push @errors, sprintf("Illegal character '%s' (code: %d (%x) at line %d column %d",$_->[0], $_->[1], $_->[1], $lineno, $_->[2]) foreach (@illegals); | |
} else { | |
push @lines, $transformed; | |
} | |
$lineno += ($count +1); | |
} | |
if (@errors) { | |
die(encode('UTF-8',join "\n", @errors) . "\n"); | |
} else { | |
for my $line (@lines) { | |
say encode('UTF-8',$line); | |
} | |
} | |
} | |
sub cleanse { | |
my $thing = shift; | |
chomp $thing; | |
# This lot as per http://cpansearch.perl.org/src/BARBIE/Text-Demoroniser-0.06/lib/Text/Demoroniser.pm | |
$thing =~ s/\x{201A}/,/g; # Low-9 single quote | |
$thing =~ s/\x{201E}/,,/g; # Low-9 double quote | |
$thing =~ s/\x{2026}/.../g; # Ellipsis | |
$thing =~ s/\x{20C6}/^/g; # Circumflex | |
$thing =~ s/\x{2018}/`/g; # Left Quote | |
$thing =~ s/\x{2019}/'/g; # Right Quote | |
$thing =~ s/\x{201C}/,/g; # Left Double Quote | |
$thing =~ s/\x{201D}/,/g; # Right Double Quote | |
$thing =~ s/\x{2022}/,/g; # Bullet | |
$thing =~ s/\x{2013}/-/g; # En dash | |
$thing =~ s/\x{2014}/-/g; # Em dash | |
$thing =~ s/\x{2039}/,/g; # Left Angle Quote | |
$thing =~ s/\x{203A}/,/g; # Right Angle Quote | |
# Other misc crap i've discovered | |
$thing =~ s/\x{0092}/'/g; # Private use two - Actually a right smart single quote | |
$thing =~ s/\x{0099}//g; # <Control> - Actually a latin-1 badly encoded Trademark symbol | |
$thing =~ s/\x{0096}/-/g; # Start of guarded area - Looks like an em dash | |
$thing =~ s/\x{0097}/-/g; # End of guarded area - Looks like an en dash | |
# Optional | |
$thing =~ s/\x{00AE}//g; # Registered Trademark | |
$thing =~ s/\x{2122}//g; # Trademark | |
$thing =~ s/\n/ /sg; # Newline | |
$thing =~ s/\x{2020}/*/g; # Dagger | |
$thing =~ s/\x{A0}/ /g; # NBSP | |
$thing =~ s/^[ \t]+//gm; # LTrim | |
$thing =~ s/[ \t]+$//gm; # RTrim | |
$thing =~ s/[ \t]*\*$//; # RTrim with asterisk | |
$thing =~ s/[ \t]+/ /g; # Plural spacings + tabs -> single space | |
# $thing =~ s/\x{00E9}/e/g; # e acute | |
return $thing; | |
} | |
sub illegal { | |
my $thing = shift; | |
my $out_of_band_re = qr/([^\x{0020}-\x{007E}\x{00E9}])/; | |
my @ret; | |
while ($thing =~ /$out_of_band_re/gm) { | |
push @ret, [$1, ord($1), $-[0]]; | |
} | |
return @ret; | |
} | |
go(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Filename to make the gist title normalisecsv.pl to make it easier to find. Github need to fix this. | |
alias fixnewlines='perl -pe "s/\r\n?/\n/g;"' | |
alias wincp1252toutf8='iconv -f latin1 -t utf-8' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment