Created
April 28, 2014 05:48
-
-
Save aniruddha-a/11362673 to your computer and use it in GitHub Desktop.
Match CSV files based on a field (name) and merge them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use Parse::CSV; | |
use strict; | |
my %wb2_table = (); | |
my $workbook2_data = Parse::CSV->new ( file => 'merged_workbook2.csv' ); | |
# Note: the header lines are removed in the CSV files for simplicity | |
# Parse one workbook and load it to a hash | |
while (my $ref = $workbook2_data->fetch) { | |
my @line = @$ref; | |
my $name = $line[5]; # 5th field is the name here | |
$name =~ s/[.,]/ /g; # cleanup | |
next if $name =~ /^$/; | |
my @nsplits = split(/ /, lc($name)); # lowecase | |
my $cn = join(' ', sort @nsplits); # canonicalize (sort n join) | |
$cn =~ s/^\s+|\s+$//g; # trim | |
$wb2_table{$cn} = $ref; | |
} | |
# Now load the second workbook and match in the hash | |
my $workbook1_data = Parse::CSV->new( file => ($ARGV[0] or 'merged_workbook1.csv') ); | |
while (my $ref = $workbook1_data->fetch ) { | |
my @line = @$ref; | |
my $name = $line[1]; # field 1 is the name here | |
$name =~ s/[.,]/ /g; # cleanup | |
my @nsplits = split(/ /, lc($name)); # lowercase it and split | |
my $cn = join (' ', sort @nsplits); # canonicalize (sort n join) | |
$cn =~ s/^\s+|\s+$//g; # trim | |
if (exists($wb2_table{$cn})) { # try matching in the hash | |
my $ar = $wb2_table{$cn}; | |
my @line_wb2 = @$ar; | |
$" = '","'; # Ensure quoting is correct | |
print "\"@line\",\""; # the current CSV row | |
print "\"@line_wb2\",\""; # CSV row of the earlier workbook | |
print "\"\n"; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment