Created
June 26, 2012 13:32
-
-
Save robotmay/2995816 to your computer and use it in GitHub Desktop.
Compare multiple CSV files; specify a pattern to match across the files and it will remove duplicates from the first input file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'csv'
require 'optparse'
require 'set'
# Run-time configuration, filled in by the option parser below.
#   :files   - parsed CSV tables (arrays of rows) in the order given to -f;
#              the first one is the file duplicates are removed from
#   :pattern - Regexp used to pick the field to compare on in each row;
#              defaults to a case-insensitive e-mail address pattern
#   :columns - zero-based column indexes to compare on instead of :pattern,
#              paired positionally with :files
#   :output  - absolute path of the CSV file to write, or nil to only report
@options = {
  :files => [],
  :pattern => /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i,
  :columns => [],
  :output => nil
}

@opts = OptionParser.new do |opts|
  opts.on "-f", "--files FILES", Array, "List of files to compare" do |files|
    # Every file is read fully into memory as soon as the option is parsed.
    @options[:files] = files.map { |path| CSV.read(File.expand_path(path)) }
  end

  opts.on "-o", "--output OUTPUT", "Output file" do |output|
    @options[:output] = File.expand_path(output)
  end

  opts.on "-p", "--pattern PATTERN", "Regexp pattern for matching columns" do |pattern|
    @options[:pattern] = Regexp.new(pattern)
  end

  opts.on "-c", "--columns COLUMNS", Array, "Alternative to Regexp: specify the column numbers for matching" do |columns|
    # Strict base-10 conversion: String#to_i would quietly turn a typo such
    # as "x" into column 0 and compare on the wrong field.
    @options[:columns] = columns.map do |column|
      begin
        Integer(column, 10)
      rescue ArgumentError, TypeError
        raise OptionParser::InvalidArgument, column.to_s
      end
    end
  end
end

# Report bad command-line input with the usage text instead of a backtrace.
begin
  @opts.parse!(ARGV)
rescue OptionParser::ParseError => e
  abort "#{e.message}\n\n#{@opts}"
end
# Row count of every input file, captured before the primary file is shifted
# off the list so the final report covers all of them.
file_row_counts = @options[:files].map(&:count)

# The first file is the one duplicates are removed from; whatever remains in
# @options[:files] is what it gets compared against.
primary_file = @options[:files].shift
abort "No input files given; pass them with -f FILE1,FILE2,..." if primary_file.nil?

# The first -c value belongs to the primary file; shifting it keeps the
# remaining entries aligned with the remaining files. shift returns nil when
# no -c was given, which makes find_needle fall back to the regexp.
primary_file_column = @options[:columns].shift

rows_to_delete = [] # primary-file rows found in another file
needle_cache = []   # per comparison file: its needles, built on first use
matches = 0
non_matches = 0
# Picks the value (the "needle") that a row is compared on.
#
# row           - Array of field values for one CSV row.
# column_number - Integer index of the field to use. When it is not an
#                 Integer, the first field matching @options[:pattern] is
#                 used instead (provided that option holds a Regexp).
#
# Prints the strategy in use to $stdout on every call. Returns the selected
# field with surrounding whitespace stripped, or "" when nothing was selected.
def find_needle(row, column_number = nil)
  if column_number.is_a?(Integer)
    $stdout.puts "Matching on column number #{column_number}"
    field = row[column_number]
    return field ? field.strip : ""
  end

  return "" unless @options[:pattern].is_a?(Regexp)

  $stdout.puts "Matching on regexp: #{@options[:pattern]}"
  field = row.find { |candidate| candidate =~ @options[:pattern] }
  field ? field.strip : ""
end
# Compare each primary-file row against every other input file. A row is
# marked for deletion when its needle also occurs in at least one of them.
primary_file.each do |row|
  needle = find_needle(row, primary_file_column)
  duplicate = false

  @options[:files].each_with_index do |file, file_index|
    # Needles of a comparison file are extracted once and kept in a Set so
    # each lookup is a hash probe rather than a scan of the whole file.
    needle_cache[file_index] ||= Set.new(
      file.map { |other_row| find_needle(other_row, @options[:columns][file_index]) }
    )
    needles = needle_cache[file_index]

    # An empty needle means the row had nothing to compare on (a header row,
    # a missing e-mail, ...); two such rows are not duplicates of each other.
    if !needle.empty? && needles.include?(needle)
      $stdout.puts "Matching needle: #{needle}"
      duplicate = true
      matches += 1
    else
      $stdout.puts "Non-matching needle: #{needle}"
      non_matches += 1
    end
  end

  # Record the row once, however many files it matched in, so the
  # "Number to delete" figure below is not inflated.
  rows_to_delete << row if duplicate
end

$stdout.puts "Row counts: #{file_row_counts}"
$stdout.puts "Matches: #{matches}"
$stdout.puts "Non-matches: #{non_matches}"
$stdout.puts "Number to delete: #{rows_to_delete.count}"

# Array#- compares rows by value, so every primary row equal to a marked row
# is dropped.
output_rows = primary_file - rows_to_delete

# Without -o the script only prints the report above.
if @options[:output]
  CSV.open(@options[:output], "wb") do |csv|
    output_rows.each { |row| csv << row }
  end
  $stdout.puts "Writing CSV file: #{@options[:output]}"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment