Skip to content

Instantly share code, notes, and snippets.

@robotmay
Created June 26, 2012 13:32
Show Gist options
  • Save robotmay/2995816 to your computer and use it in GitHub Desktop.
Save robotmay/2995816 to your computer and use it in GitHub Desktop.
Compare multiple CSV files; specify a pattern to match across the files and it will remove duplicates from the first input file.
#!/usr/bin/env ruby
require 'optparse'
require 'csv'
# Default settings; the option handlers overwrite these during argument parsing.
#   :files   - parsed CSV tables, one per input file (the first is the primary)
#   :pattern - Regexp used to pick the field to compare when no column is given;
#              defaults to a case-insensitive email-like pattern
#   :columns - explicit column indices to compare on, one per file
#   :output  - path the de-duplicated primary file is written to (nil = no file)
@options = {
  :files => [],
  # The TLD part accepts two or more letters so that long TLDs such as
  # ".museum" or ".online" are recognised as well as ".com" / ".info".
  :pattern => /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i,
  :columns => [],
  :output => nil
}
# Command-line interface. Each handler stores its result in @options.
@opts = OptionParser.new do |opts|
  # Input files are read and parsed as soon as the option is seen, so
  # @options[:files] holds arrays of rows rather than file names.
  opts.on "-f", "--files FILES", Array, "List of files to compare" do |files|
    @options[:files] = files.map do |f|
      CSV.read File.expand_path(f)
    end
  end

  opts.on "-o", "--output OUTPUT", "Output file" do |output|
    @options[:output] = File.expand_path(output)
  end

  opts.on "-p", "--pattern PATTERN", "Regexp pattern for matching columns" do |pattern|
    @options[:pattern] = Regexp.new(pattern)
  end

  opts.on "-c", "--columns COLUMNS", Array, "Alternative to Regexp: specify the column numbers for matching" do |columns|
    # Parse strictly: String#to_i would silently turn a typo such as "x" into
    # column 0 and the script would then compare on the wrong field.
    @options[:columns] = columns.map do |column|
      begin
        Integer(column.strip, 10)
      rescue ArgumentError
        raise OptionParser::InvalidArgument, column
      end
    end
  end
end

@opts.parse!(ARGV)
# Row counts of every input file, captured for the summary before the primary
# file is taken off the list.
file_row_counts = @options[:files].map(&:count)

# The first file is the one that gets de-duplicated; the files left in
# @options[:files] are only compared against.
primary_file = @options[:files].shift

# Without any input file there is nothing to compare; stop with the usage text
# instead of failing later with a NoMethodError on nil.
abort "No input files given.\n\n#{@opts.help}" if primary_file.nil?

# Column index for the primary file. Array#shift returns nil when no columns
# were given, which makes find_needle fall back to the regexp. After the shift
# the remaining column indices line up with the remaining comparison files.
primary_file_column = @options[:columns].shift

rows_to_delete = []
needle_cache = []
matches = 0
non_matches = 0
# Extracts the value ("needle") used to compare a row against other files.
#
# row           - Array of fields for one CSV row.
# column_number - Integer index of the field to use; when it is not an Integer
#                 the first field matching @options[:pattern] is used instead.
#
# Logs the chosen strategy to stdout on every call.
# Returns the selected field with surrounding whitespace stripped, or "" when
# no field was selected.
def find_needle(row, column_number = nil)
  picked =
    case column_number
    when Integer
      $stdout.puts "Matching on column number #{column_number}"
      row[column_number]
    else
      regexp = @options[:pattern]
      if regexp.is_a?(Regexp)
        $stdout.puts "Matching on regexp: #{regexp}"
        row.find { |cell| cell =~ regexp }
      end
    end

  picked ? picked.strip : ""
end
# Compare every row of the primary file against each comparison file and flag
# the row for removal when its needle also occurs in any of them.
primary_file.each do |row|
  needle = find_needle(row, primary_file_column)
  duplicated = false

  @options[:files].each_with_index do |file, file_index|
    # Each comparison file's needles are extracted once, on first use, and kept
    # as Hash keys so the membership test below is a constant-time lookup.
    needle_cache[file_index] ||= file.each_with_object({}) { |other_row, seen|
      seen[find_needle(other_row, @options[:columns][file_index])] = true
    }
    needles = needle_cache[file_index]

    # An empty needle means no comparable field was found in the row; it must
    # not count as a duplicate just because another file also has such a row.
    if !needle.empty? && needles.key?(needle)
      $stdout.puts "Matching needle: #{needle}"
      duplicated = true
      matches += 1
    else
      $stdout.puts "Non-matching needle: #{needle}"
      non_matches += 1
    end
  end

  # Record the row once, even if it matched in several comparison files, so
  # the "Number to delete" figure counts rows rather than matches.
  rows_to_delete << row if duplicated
end

$stdout.puts "Row counts: #{file_row_counts}"
$stdout.puts "Matches: #{matches}"
$stdout.puts "Non-matches: #{non_matches}"
$stdout.puts "Number to delete: #{rows_to_delete.count}"

# Array difference drops every primary row equal to a flagged row.
output_rows = primary_file - rows_to_delete

# The result is only written out when an output path was given.
if @options[:output]
  CSV.open(@options[:output], "wb") do |csv|
    output_rows.each { |row| csv << row }
  end
  $stdout.puts "Writing CSV file: #{@options[:output]}"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment