robotmay/compare.rb

## compare.rb
#!/usr/bin/env ruby
require 'optparse'
require 'csv'

# Runtime configuration; the defaults below are overwritten by the option
# parser when the corresponding command-line switch is given.
#
#   :files   - parsed CSV tables (arrays of rows), one per input file
#   :pattern - Regexp used to pick the field to compare when no column
#              number is supplied; defaults to a loose e-mail matcher
#   :columns - optional column numbers used instead of :pattern
#   :output  - path of the CSV file to write, or nil to skip writing
@options = {
  :files => [],
  # The TLD part accepts 2..63 letters (63 is the DNS label limit) so that
  # addresses under long TLDs such as .museum or .technology are recognised.
  :pattern => /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,63}\b/i,
  :columns => [],
  :output => nil
}

# Command-line interface. Every handler stores its parsed value in @options.
# Invalid values are reported as OptionParser::InvalidArgument instead of
# being silently coerced or surfacing as an unrelated exception.
@opts = OptionParser.new do |opts|
  # Comma-separated list of CSV paths; each file is read and parsed
  # immediately, so @options[:files] holds row arrays rather than paths.
  opts.on "-f", "--files FILES", Array, "List of files to compare" do |files|
    @options[:files] = files.map { |path| CSV.read(File.expand_path(path)) }
  end

  opts.on "-o", "--output OUTPUT", "Output file" do |output|
    @options[:output] = File.expand_path(output)
  end

  opts.on "-p", "--pattern PATTERN", "Regexp pattern for matching columns" do |pattern|
    begin
      @options[:pattern] = Regexp.new(pattern)
    rescue RegexpError => e
      raise OptionParser::InvalidArgument, e.message
    end
  end

  opts.on "-c", "--columns COLUMNS", Array, "Alternative to Regexp: specify the column numbers for matching" do |columns|
    @options[:columns] = columns.map do |column|
      begin
        # Strict base-10 parsing: String#to_i would turn a typo such as "x"
        # (or an empty list entry) into column 0 without any warning.
        Integer(column, 10)
      rescue ArgumentError, TypeError
        raise OptionParser::InvalidArgument, "#{column.inspect} is not a column number"
      end
    end
  end
end

# Consumes the recognised switches from ARGV.
@opts.parse!(ARGV)

# Row count of every input file, captured before the primary file is removed
# from the list.
file_row_counts = @options[:files].map { |table| table.count }

# The first file is the one being filtered; the remaining entries stay in
# @options[:files] and are the files it is compared against.
primary_file = @options[:files].shift

# The first column number, if any, belongs to the primary file.
# Array#shift returns nil for an empty list, which selects regexp matching.
primary_file_column = @options[:columns].shift

# Counters and accumulators used by the comparison loop.
matches = 0
non_matches = 0
rows_to_delete = []
needle_cache = []

# Extracts the value ("needle") used to compare a row against other rows.
#
# row           - one CSV row (array of fields)
# column_number - Integer index of the field to use; when it is not an
#                 Integer, the first field matching @options[:pattern] is used
#
# Prints which strategy was applied. Returns the chosen field with surrounding
# whitespace stripped, or "" when no field was selected.
def find_needle(row, column_number = nil)
  candidate = nil

  case column_number
  when Integer
    $stdout.puts "Matching on column number #{column_number}"
    candidate = row[column_number]
  else
    regexp = @options[:pattern]
    if regexp.is_a?(Regexp)
      $stdout.puts "Matching on regexp: #{regexp}"
      candidate = row.find { |cell| cell =~ regexp }
    end
  end

  # A missing field (nil) is normalised to an empty string before stripping.
  (candidate || "").strip
end

# Main pass: drop every row of the primary file whose needle also occurs in
# one of the comparison files, print statistics, and optionally write the
# surviving rows to the output file.

# Without at least one input file there is nothing to iterate over; stop with
# the usage text instead of failing on nil.
abort "No input files given (use -f).\n\n#{@opts}" if primary_file.nil?

primary_file.each do |row|
  needle = find_needle(row, primary_file_column)
  matched = false

  @options[:files].each_with_index do |file, file_index|
    # The needles of each comparison file are extracted once and kept as hash
    # keys, so each lookup is a hash access rather than a scan of the file.
    needles = (needle_cache[file_index] ||= file.each_with_object({}) do |r, seen|
      seen[find_needle(r, @options[:columns][file_index])] = true
    end)

    # An empty needle means no comparable value was found in the row; two rows
    # that both lack a value are not treated as duplicates of each other.
    if !needle.empty? && needles.key?(needle)
      $stdout.puts "Matching needle: #{needle}"
      matched = true
      matches += 1
    else
      $stdout.puts "Non-matching needle: #{needle}"
      non_matches += 1
    end
  end

  # Record the row once, even when several comparison files contain it, so the
  # "Number to delete" figure counts rows rather than individual matches.
  rows_to_delete << row if matched
end

$stdout.puts "Row counts: #{file_row_counts}"
$stdout.puts "Matches: #{matches}"
$stdout.puts "Non-matches: #{non_matches}"
$stdout.puts "Number to delete: #{rows_to_delete.count}"

# Array difference compares rows by value, so every primary row equal to a
# deleted row is removed.
output_rows = primary_file - rows_to_delete

if @options[:output]
  CSV.open(@options[:output], "wb") do |csv|
    output_rows.each { |row| csv << row }
  end
  $stdout.puts "Writing CSV file: #{@options[:output]}"
end
	#!/usr/bin/env ruby
	require 'optparse'
	require 'csv'

	@options = {
	:files => [],
	:pattern => /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i,
	:columns => [],
	:output => nil
	}

	@opts = OptionParser.new do |opts|
	opts.on "-f", "--files FILES", Array, "List of files to compare" do |files|
	@options[:files] = files.collect do |f|
	CSV.read File.expand_path(f)
	end
	end

	opts.on "-o", "--output OUTPUT", "Output file" do |output|
	@options[:output] = File.expand_path(output)
	end

	opts.on "-p", "--pattern PATTERN", "Regexp pattern for matching columns" do |pattern|
	@options[:pattern] = Regexp.new(pattern)
	end

	opts.on "-c", "--columns COLUMNS", Array, "Alternative to Regexp: specify the column numbers for matching" do |columns|
	@options[:columns] = columns.collect(&:to_i)
	end
	end

	@opts.parse!(ARGV)

	file_row_counts = @options[:files].collect(&:count)
	primary_file = @options[:files].shift
	primary_file_column = if @options[:columns].size > 0
	@options[:columns].shift
	else
	nil
	end
	rows_to_delete = []
	needle_cache = []
	matches = 0
	non_matches = 0

	def find_needle(row, column_number = nil)
	needle = if column_number.is_a?(Integer)
	$stdout.puts "Matching on column number #{column_number}"
	row[column_number]
	elsif @options[:pattern].is_a?(Regexp)
	$stdout.puts "Matching on regexp: #{@options[:pattern]}"
	row.find { |field| field =~ @options[:pattern] }
	end

	(needle || "").strip
	end

	primary_file.each_with_index do |row, row_index|
	needle = find_needle(row, primary_file_column)

	@options[:files].each_with_index do |file, file_index|
	needles = (needle_cache[file_index] ||= file.collect { |r| find_needle(r, @options[:columns][file_index]) })
	if needles.include?(needle)
	$stdout.puts "Matching needle: #{needle}"
	rows_to_delete << row
	matches += 1
	else
	$stdout.puts "Non-matching needle: #{needle}"
	non_matches += 1
	end
	end
	end

	$stdout.puts "Row counts: #{file_row_counts}"
	$stdout.puts "Matches: #{matches}"
	$stdout.puts "Non-matches: #{non_matches}"
	$stdout.puts "Number to delete: #{rows_to_delete.count}"

	output_rows = primary_file - rows_to_delete

	if @options[:output]
	CSV.open(@options[:output], "wb") do |csv|
	output_rows.each do |row|
	csv << row
	end
	end
	$stdout.puts "Writing CSV file: #{@options[:output]}"
	end