Skip to content

Instantly share code, notes, and snippets.

@murphyslaw
Created May 28, 2011 07:49
Show Gist options
  • Save murphyslaw/996701 to your computer and use it in GitHub Desktop.
A ruby script that removes duplicate rows in a csv file. Duplicates are found based on an identifier column and a criteria column, which are configurable.
#!/usr/bin/ruby -w
require 'csv'
require 'active_support/core_ext'
# Removes duplicate rows from a CSV file. Two rows count as duplicates when
# they share the same value in the identifier column AND in the criteria
# column; only the first such row per pair is kept.
class Parser
  attr_accessor :input_folder
  attr_accessor :output_folder
  attr_accessor :filename
  # NOTE: "seperator" is a misspelling of "separator", kept as-is so the
  # accessor stays backward compatible with existing callers.
  attr_accessor :seperator
  attr_accessor :column_names
  attr_accessor :entries
  attr_accessor :output
  attr_accessor :identifier
  attr_accessor :criteria
  attr_accessor :output_columns

  # filename   - CSV file name, resolved against the input/output folders
  # identifier - name of the column whose value groups rows together
  # criteria   - name of the column used to distinguish rows within a group
  # options    - :input_folder, :output_folder, and :output_columns
  #              (:all, or an Array of column names to keep in the output)
  def initialize(filename, identifier, criteria, options = {})
    # Plain Hash#merge instead of ActiveSupport's reverse_merge! so the
    # class depends only on the standard library.
    options = {
      :input_folder   => "../csv/",
      :output_folder  => "../output/",
      :output_columns => :all,
    }.merge(options)
    @filename = filename
    @input_folder = options[:input_folder]
    @output_folder = options[:output_folder]
    @identifier = identifier
    @criteria = criteria
    @seperator = ";"
    @output_columns = options[:output_columns]
  end

  def input_path
    input_folder + filename
  end

  def output_path
    output_folder + filename
  end

  # Reads every row of the input file into @entries and splits the header
  # row off into @column_names. Uses the Ruby 1.9+ CSV API: the original
  # called the removed 1.8 form CSV.open(path, 'r', sep), which raises an
  # ArgumentError on any modern Ruby.
  def read_csv
    @entries = []
    @output = []
    CSV.foreach(input_path, :col_sep => @seperator) do |row|
      @entries << row
    end
    @column_names = entries.delete_at(0)
  end

  # For each identifier value, keeps only the first row seen for each
  # distinct criteria value; the surviving (filtered) rows go to @output.
  def remove_duplicates
    identifier_index = column_names.index(identifier)
    criteria_index = column_names.index(criteria)
    grouped = entries.group_by { |entry| entry[identifier_index] }
    grouped.each do |_key, rows|
      seen = {}
      rows.each do |row|
        value = row[criteria_index]
        next if seen[value]
        seen[value] = true
        output << filter_row(row)
      end
    end
  end

  # Projects a row down to the configured :output_columns; returns the row
  # unchanged when all columns are requested.
  def filter_row(row)
    return row if output_columns == :all
    output_columns.map { |column| row[column_names.index(column)] }
  end

  # Writes the filtered header plus every deduplicated row to output_path.
  def write_csv
    CSV.open(output_path, "w", :col_sep => seperator) do |csv|
      csv << filter_row(column_names)
      output.each { |row| csv << row }
    end
  end

  # Prints row counts and a per-criteria-value hit summary.
  # NOTE(review): criteria_index indexes the ORIGINAL column layout but is
  # applied to @output rows, which may have been narrowed by
  # :output_columns — the figures are only reliable when output_columns is
  # :all or the criteria column's position is unchanged. TODO confirm.
  def statistic
    size = output.size
    total = entries.size
    criteria_index = column_names.index(criteria)
    criteria_values = entries.map { |row| row[criteria_index] }.uniq
    # to_s handles the nil fields CSV yields for empty cells (the original
    # used ActiveSupport's blank? here).
    empty = output.count { |row| row[criteria_index].to_s.strip.empty? }
    identifier_per_criteria = criteria_values.inject("") do |string, field|
      hits = output.count { |row| row[criteria_index] == field }
      string + "#{field} (#{hits}); "
    end
    puts
    puts " #total: #{total}"
    puts " #rows: #{size}"
    puts " empty: #{empty}"
    puts " #{column_names[criteria_index]}: #{identifier_per_criteria}\n\n\n"
  end

  # Full pipeline: read, deduplicate, write, report.
  def run
    puts "=> #{filename}" # fixed: original printed a garbled literal
    read_csv
    remove_duplicates
    write_csv
    statistic
  end
end
# Driver: deduplicate each configured CSV file in turn, grouping rows by
# the "ID" column and distinguishing them by the "City" column.
filenames = %w[file1.csv file2.csv file3.csv]
identifier = "ID"
criteria = "City"
options = { :output_columns => :all }

filenames.each do |filename|
  Parser.new(filename, identifier, criteria, options).run
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment