jjl/normalisecsv.pl

## normalisecsv.pl
use Text::CSV::Encoded;
use Encode qw(encode decode);
use autodie;
use feature ':5.16';
use strict;
use warnings;

sub go {
  my $lineno = 1; # Sadly, using Text::CSV to read the lines (necessary for
                  # correctly handling embedded newlines) breaks $.
  my $csv = Text::CSV::Encoded->new({
    binary => 1,  # Allows for embedded newlines
    encoding_in  => "utf-8", # Because we like non-ASCII bits
    encoding_out => undef,   # If you set this to utf-8, it gives you back a bytestring
  });
  my @errors;
  my @lines;
  while (my $row = $csv->getline(*ARGV)) {
    my @cols = @$row;
    my $tmp = join ",", @cols;
    my $count = $tmp =~ tr/\n//; # Embedded newlines ruin the line count
    my @ret = map {cleanse($_)} @cols;
    $csv->combine(@ret);
    my $transformed = $csv->string;
    my @illegals = illegal($transformed);
    if (@illegals) {
      @lines=();
      push @errors, sprintf("Illegal character '%s' (code: %d (%x) at line %d column %d",$_->[0], $_->[1], $_->[1], $lineno, $_->[2]) foreach (@illegals);
    } else {
      push @lines, $transformed;
    }
    $lineno += ($count +1);
  }
  if (@errors) {
    die(encode('UTF-8',join "\n", @errors) . "\n");
  } else {
    for my $line (@lines) {
      say encode('UTF-8',$line);
    }
  }
}

sub cleanse {
  my $thing = shift;

  chomp $thing;

  # This lot as per http://cpansearch.perl.org/src/BARBIE/Text-Demoroniser-0.06/lib/Text/Demoroniser.pm
  $thing =~ s/\x{201A}/,/g;   # Low-9 single quote
  $thing =~ s/\x{201E}/,,/g;  # Low-9 double quote
  $thing =~ s/\x{2026}/.../g; # Ellipsis
  $thing =~ s/\x{20C6}/^/g;   # Circumflex
  $thing =~ s/\x{2018}/`/g;   # Left Quote
  $thing =~ s/\x{2019}/'/g;   # Right Quote
  $thing =~ s/\x{201C}/,/g;   # Left Double Quote
  $thing =~ s/\x{201D}/,/g;   # Right Double Quote
  $thing =~ s/\x{2022}/,/g;   # Bullet
  $thing =~ s/\x{2013}/-/g;   # En dash
  $thing =~ s/\x{2014}/-/g;   # Em dash
  $thing =~ s/\x{2039}/,/g;   # Left Angle Quote
  $thing =~ s/\x{203A}/,/g;   # Right Angle Quote

  # Other misc crap i've discovered
  $thing =~ s/\x{0092}/'/g; # Private use two - Actually a right smart single quote

  $thing =~ s/\x{0099}//g;  # <Control> - Actually a latin-1 badly encoded Trademark symbol
  $thing =~ s/\x{0096}/-/g; # Start of guarded area - Looks like an em dash
  $thing =~ s/\x{0097}/-/g; # End of guarded area - Looks like an en dash

  # Optional
  $thing =~ s/\x{00AE}//g;   # Registered Trademark
  $thing =~ s/\x{2122}//g;   # Trademark
  $thing =~ s/\n/ /sg;      # Newline

  $thing =~ s/\x{2020}/*/g; # Dagger
  $thing =~ s/\x{A0}/ /g;   # NBSP
  $thing =~ s/^[ \t]+//gm;  # LTrim
  $thing =~ s/[ \t]+$//gm;  # RTrim
  $thing =~ s/[ \t]*\*$//;  # RTrim with asterisk
  $thing =~ s/[ \t]+/ /g;   # Plural spacings + tabs -> single space
#  $thing =~ s/\x{00E9}/e/g; # e acute

  return $thing;
}

sub illegal {
  my $thing = shift;
  my $out_of_band_re = qr/([^\x{0020}-\x{007E}\x{00E9}])/;
  my @ret;
  while ($thing =~ /$out_of_band_re/gm) {
    push @ret, [$1, ord($1), $-[0]];
  }
  return @ret;
}

go();

## zaliases.sh
# Filename to make the gist title normalisecsv.pl to make it easier to find. Github need to fix this.
alias fixnewlines='perl -pe "s/\r\n?/\n/g;"'
alias wincp1252toutf8='iconv -f latin1 -t utf-8'
	use Text::CSV::Encoded;
	use Encode qw(encode decode);
	use autodie;
	use feature ':5.16';
	use strict;
	use warnings;

	sub go {
	my $lineno = 1; # Sadly, using Text::CSV to read the lines (necessary for
	# correctly handling embedded newlines) breaks $.
	my $csv = Text::CSV::Encoded->new({
	binary => 1, # Allows for embedded newlines
	encoding_in => "utf-8", # Because we like non-ASCII bits
	encoding_out => undef, # If you set this to utf-8, it gives you back a bytestring
	});
	my @errors;
	my @lines;
	while (my $row = $csv->getline(*ARGV)) {
	my @cols = @$row;
	my $tmp = join ",", @cols;
	my $count = $tmp =~ tr/\n//; # Embedded newlines ruin the line count
	my @ret = map {cleanse($_)} @cols;
	$csv->combine(@ret);
	my $transformed = $csv->string;
	my @illegals = illegal($transformed);
	if (@illegals) {
	@lines=();
	push @errors, sprintf("Illegal character '%s' (code: %d (%x) at line %d column %d",$_->[0], $_->[1], $_->[1], $lineno, $_->[2]) foreach (@illegals);
	} else {
	push @lines, $transformed;
	}
	$lineno += ($count +1);
	}
	if (@errors) {
	die(encode('UTF-8',join "\n", @errors) . "\n");
	} else {
	for my $line (@lines) {
	say encode('UTF-8',$line);
	}
	}
	}

	sub cleanse {
	my $thing = shift;

	chomp $thing;

	# This lot as per http://cpansearch.perl.org/src/BARBIE/Text-Demoroniser-0.06/lib/Text/Demoroniser.pm
	$thing =~ s/\x{201A}/,/g; # Low-9 single quote
	$thing =~ s/\x{201E}/,,/g; # Low-9 double quote
	$thing =~ s/\x{2026}/.../g; # Ellipsis
	$thing =~ s/\x{20C6}/^/g; # Circumflex
	$thing =~ s/\x{2018}/`/g; # Left Quote
	$thing =~ s/\x{2019}/'/g; # Right Quote
	$thing =~ s/\x{201C}/,/g; # Left Double Quote
	$thing =~ s/\x{201D}/,/g; # Right Double Quote
	$thing =~ s/\x{2022}/,/g; # Bullet
	$thing =~ s/\x{2013}/-/g; # En dash
	$thing =~ s/\x{2014}/-/g; # Em dash
	$thing =~ s/\x{2039}/,/g; # Left Angle Quote
	$thing =~ s/\x{203A}/,/g; # Right Angle Quote

	# Other misc crap i've discovered
	$thing =~ s/\x{0092}/'/g; # Private use two - Actually a right smart single quote

	$thing =~ s/\x{0099}//g; # <Control> - Actually a latin-1 badly encoded Trademark symbol
	$thing =~ s/\x{0096}/-/g; # Start of guarded area - Looks like an em dash
	$thing =~ s/\x{0097}/-/g; # End of guarded area - Looks like an en dash

	# Optional
	$thing =~ s/\x{00AE}//g; # Registered Trademark
	$thing =~ s/\x{2122}//g; # Trademark
	$thing =~ s/\n/ /sg; # Newline

	$thing =~ s/\x{2020}/*/g; # Dagger
	$thing =~ s/\x{A0}/ /g; # NBSP
	$thing =~ s/^[ \t]+//gm; # LTrim
	$thing =~ s/[ \t]+$//gm; # RTrim
	$thing =~ s/[ \t]\$//; # RTrim with asterisk
	$thing =~ s/[ \t]+/ /g; # Plural spacings + tabs -> single space
	# $thing =~ s/\x{00E9}/e/g; # e acute

	return $thing;
	}

	sub illegal {
	my $thing = shift;
	my $out_of_band_re = qr/([^\x{0020}-\x{007E}\x{00E9}])/;
	my @ret;
	while ($thing =~ /$out_of_band_re/gm) {
	push @ret, [$1, ord($1), $-[0]];
	}
	return @ret;
	}

	go();
	# Filename to make the gist title normalisecsv.pl to make it easier to find. Github need to fix this.
	alias fixnewlines='perl -pe "s/\r\n?/\n/g;"'
	alias wincp1252toutf8='iconv -f latin1 -t utf-8'