Created
January 28, 2016 16:02
-
-
Save alecramsay/23b994714a6d860b7e1d to your computer and use it in GitHub Desktop.
This Perl script eats CR and LF characters embedded within quoted strings in a .csv file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# This Perl script eats CR and LF characters embedded within quoted strings in a .csv file.
# It assumes the file uses comma-separated values quoting rules:
# https://en.wikipedia.org/wiki/Comma-separated_values#Basic_rules
# and should work with any file, .csv or not, that follows them.
#
# Doubled double-quotes ("") inside a quoted string need no special handling:
# the first quote leaves quoted-string mode and the second one immediately
# re-enters it, so the mode is correct for every character that follows.
#
# The filtered text is printed to STDOUT; the input file is only read.
#
# Args <input file>
use strict;
use warnings;

my ($file) = @ARGV;

# validate $file
if (not defined $file) {
    die "You need to provide a file to process.\n";
}

open(my $fh, "<", $file) or die "Can't open file: $!";

my $quote            = '"';
my $in_quoted_string = 0;

# Read one character at a time; getc returns undef at end of file, which
# ends the loop (a literal "0" character is still defined, so it is kept).
while (defined(my $char = getc($fh))) {
    if ($char eq $quote) {
        # A quote either opens or closes a quoted string: flip the mode.
        # The quote itself is always passed through.
        $in_quoted_string = $in_quoted_string ? 0 : 1;
    }
    elsif ($in_quoted_string and ($char eq "\n" or $char eq "\r")) {
        # Embedded line feed (LF) or carriage return (CR) inside a quoted
        # string; eat it.
        next;
    }

    # write a character
    print $char;
}

# Use low-precedence "or" (with explicit parens) so the die applies to the
# result of close(); "close $fh || die ..." would parse as
# close($fh || die ...) and never report a failure.
close($fh) or die "Couldn't close file properly!";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment