Skip to content

Instantly share code, notes, and snippets.

@danlynn
Created June 15, 2020 22:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danlynn/13192ae6cf9328e7b0c7b62e2c1c0ba1 to your computer and use it in GitHub Desktop.
Save danlynn/13192ae6cf9328e7b0c7b62e2c1c0ba1 to your computer and use it in GitHub Desktop.
require 'csv'
merged_file_path = '/Users/danlynn/Documents/temp/merge-csvs/ahoy_report-20200607-20200613-merge.csv'
file_paths = [
'/Users/danlynn/Documents/temp/merge-csvs/ahoy_report-20200607-20200613-interim.csv',
'/Users/danlynn/Documents/temp/merge-csvs/ahoy_report-20200607-20200613-legacy.csv',
'/Users/danlynn/Documents/temp/merge-csvs/ahoy_report-20200607-20200613-new.csv'
]
# Merge column titles from each file so that we can start
# writing a new file that includes all titles.
merged_col_titles = {}
file_paths.each do |file_path|
begin
file = File.open(file_path)
line = file.readline
ensure
file.close
end
cols_in_file = 0
unique_col_title_count = 0
fields = CSV.parse(line)[0]
fields.each do |col_title|
cols_in_file += 1
unless merged_col_titles[col_title]
merged_col_titles[col_title] = merged_col_titles.size
unique_col_title_count += 1
end
end
puts "#{File.basename(file_path)}:"
puts " cols in file: #{cols_in_file}"
puts " new unique col titles: #{unique_col_title_count}"
puts " total unique col titles: #{merged_col_titles.size}"
end
puts "\nmerged col titles:"
merged_col_titles.each do |col_title, col_idx|
puts " #{"%3.3s" % col_idx}: #{col_title}"
end
read_count = 0
combined_count = 0
written_count = 0
merged_rows = {}
file_paths.each do |file_path|
puts "\n=== file: #{File.basename(file_path)}"
col_titles = nil
CSV.foreach(file_path, 'r') do |row|
unless col_titles
col_titles ||= row
else
read_count += 1
merged_row = nil
row.each_with_index do |cell, col_idx|
if col_idx == 0
unless merged_rows[row[0]]
merged_rows[row[0]] ||= Array.new(merged_col_titles.size)
merged_rows[row[0]][0] = row[0]
else
combined_count += 1
end
merged_row = merged_rows[row[0]]
puts " row: #{row[0]}"
else
col_mapped_idx = merged_col_titles[col_titles[col_idx]]
# if col_titles[col_idx] == 'Ahoy!wich Ice Cream Sandwiches'
# puts "test: map #{col_idx} -> #{col_mapped_idx}: #{cell}"
# end
# puts "--- map: #{col_titles[col_idx]}: #{col_idx} -> #{col_mapped_idx}"
merged_row[col_mapped_idx] ||= 0
merged_row[col_mapped_idx] += cell.to_i
end
end
end
end
end
CSV.open(merged_file_path, 'w') do |merged_csv|
merged_csv << merged_col_titles.keys
sorted_merged_rows = merged_rows.values.sort_by{|row| row[0]}
sorted_merged_rows.each do |merged_row|
written_count += 1
merged_csv << merged_row
end
end
puts "\ncomplete:"
puts " read: #{read_count}"
puts " combined: #{combined_count}"
puts " written: #{written_count}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment