Created
March 11, 2016 15:28
-
-
Save trammell/18c5b8510877a51cc22a to your computer and use it in GitHub Desktop.
A short Perl script to remove columns from a CSV file, based on column headers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
=pod | |
=head1 NAME | |
prune-columns.pl - prune columns from a CSV file | |
=head1 USAGE | |
prune-columns.pl csvfile regex [regex ...] | |
=head1 DESCRIPTION | |
The script reads from the CSV file specified in argument B<csvfile>, flags the | |
columns to be removed in memory, and prints to STDOUT a new CSV file that is | |
missing the indicated columns. | |
The original CSV file is not altered. You'll need to capture the script output | |
to get the new CSV file. | |
=head1 TO DO | |
At some point it might be nice to have code that removes by column index as | |
well, but what I need now is just to remove by pattern. | |
=cut | |
use strict; | |
use warnings FATAL => 'all'; | |
use Data::Dumper; | |
use Pod::Usage; | |
use Text::CSV_XS; | |
# homebrew argument parser, maybe upgrade to Getopt::Long? | |
pod2usage() unless @ARGV; | |
my ($csvfile, @regexes) = @ARGV; | |
pod2usage("CSV file '$csvfile' not found.") unless -f $csvfile; | |
pod2usage("Please specify one or more regular expressions.") unless @regexes; | |
my @columns = columns($csvfile); # my list of CSV columns | |
my %deleted; # empty hash of column indices to "delete" | |
# find indices of columns matching regex(es) | |
foreach my $regex (@regexes) { | |
foreach my $i (0 .. ($#columns - 1)) { | |
my $col = $columns[$i]; | |
$deleted{$i} = $col if $col =~ /$regex/; | |
} | |
} | |
warn Dumper(\%deleted); | |
# construct the "array slice" to apply (keep these columns) | |
my @slice = grep { !$deleted{$_} } 0 .. $#columns - 1; | |
# Read the CSV file line by line and slice out the columns we want to keep as | |
# we print. | |
open my $fh, "<", $csvfile or die "$csvfile: $!"; | |
while (my $row = csv()->getline($fh)) { | |
my @fields = @$row; | |
csv()->print(*STDOUT, [ @fields[@slice] ]) or csv()->error_diag; | |
} | |
close $fh or die $!; | |
my $CSV; | |
sub csv { | |
$CSV ||= Text::CSV_XS->new({ binary => 1, | |
auto_diag => 3, | |
allow_loose_quotes => 1, | |
eol => $/ }); | |
} | |
my $_columns; | |
sub columns { | |
my $csvfile = shift; | |
$_columns ||= do { | |
open(my $fh, '<', $csvfile) or die $!; | |
my @cols = @{ csv()->getline($fh) }; | |
close $fh or die $!; | |
for (@cols) { s/^\s+//; s/\s+$//; } | |
\@cols; | |
}; | |
return @{ $_columns }; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment