Skip to content

Instantly share code, notes, and snippets.

@jjl
Last active December 22, 2015 10:58
Show Gist options
  • Save jjl/6462223 to your computer and use it in GitHub Desktop.
Save jjl/6462223 to your computer and use it in GitHub Desktop.
Useful things for cleaning up csv
use Text::CSV::Encoded;
use Encode qw(encode decode);
use autodie;
use feature ':5.16';
use strict;
use warnings;
sub go {
my $lineno = 1; # Sadly, using Text::CSV to read the lines (necessary for
# correctly handling embedded newlines) breaks $.
my $csv = Text::CSV::Encoded->new({
binary => 1, # Allows for embedded newlines
encoding_in => "utf-8", # Because we like non-ASCII bits
encoding_out => undef, # If you set this to utf-8, it gives you back a bytestring
});
my @errors;
my @lines;
while (my $row = $csv->getline(*ARGV)) {
my @cols = @$row;
my $tmp = join ",", @cols;
my $count = $tmp =~ tr/\n//; # Embedded newlines ruin the line count
my @ret = map {cleanse($_)} @cols;
$csv->combine(@ret);
my $transformed = $csv->string;
my @illegals = illegal($transformed);
if (@illegals) {
@lines=();
push @errors, sprintf("Illegal character '%s' (code: %d (%x) at line %d column %d",$_->[0], $_->[1], $_->[1], $lineno, $_->[2]) foreach (@illegals);
} else {
push @lines, $transformed;
}
$lineno += ($count +1);
}
if (@errors) {
die(encode('UTF-8',join "\n", @errors) . "\n");
} else {
for my $line (@lines) {
say encode('UTF-8',$line);
}
}
}
sub cleanse {
my $thing = shift;
chomp $thing;
# This lot as per http://cpansearch.perl.org/src/BARBIE/Text-Demoroniser-0.06/lib/Text/Demoroniser.pm
$thing =~ s/\x{201A}/,/g; # Low-9 single quote
$thing =~ s/\x{201E}/,,/g; # Low-9 double quote
$thing =~ s/\x{2026}/.../g; # Ellipsis
$thing =~ s/\x{20C6}/^/g; # Circumflex
$thing =~ s/\x{2018}/`/g; # Left Quote
$thing =~ s/\x{2019}/'/g; # Right Quote
$thing =~ s/\x{201C}/,/g; # Left Double Quote
$thing =~ s/\x{201D}/,/g; # Right Double Quote
$thing =~ s/\x{2022}/,/g; # Bullet
$thing =~ s/\x{2013}/-/g; # En dash
$thing =~ s/\x{2014}/-/g; # Em dash
$thing =~ s/\x{2039}/,/g; # Left Angle Quote
$thing =~ s/\x{203A}/,/g; # Right Angle Quote
# Other misc crap i've discovered
$thing =~ s/\x{0092}/'/g; # Private use two - Actually a right smart single quote
$thing =~ s/\x{0099}//g; # <Control> - Actually a latin-1 badly encoded Trademark symbol
$thing =~ s/\x{0096}/-/g; # Start of guarded area - Looks like an em dash
$thing =~ s/\x{0097}/-/g; # End of guarded area - Looks like an en dash
# Optional
$thing =~ s/\x{00AE}//g; # Registered Trademark
$thing =~ s/\x{2122}//g; # Trademark
$thing =~ s/\n/ /sg; # Newline
$thing =~ s/\x{2020}/*/g; # Dagger
$thing =~ s/\x{A0}/ /g; # NBSP
$thing =~ s/^[ \t]+//gm; # LTrim
$thing =~ s/[ \t]+$//gm; # RTrim
$thing =~ s/[ \t]*\*$//; # RTrim with asterisk
$thing =~ s/[ \t]+/ /g; # Plural spacings + tabs -> single space
# $thing =~ s/\x{00E9}/e/g; # e acute
return $thing;
}
sub illegal {
my $thing = shift;
my $out_of_band_re = qr/([^\x{0020}-\x{007E}\x{00E9}])/;
my @ret;
while ($thing =~ /$out_of_band_re/gm) {
push @ret, [$1, ord($1), $-[0]];
}
return @ret;
}
go();
# Filename to make the gist title normalisecsv.pl to make it easier to find. Github need to fix this.
alias fixnewlines='perl -pe "s/\r\n?/\n/g;"'
alias wincp1252toutf8='iconv -f latin1 -t utf-8'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment