Skip to content

Instantly share code, notes, and snippets.

@kapcod
Last active August 3, 2022 13:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kapcod/3d511197b73f8d9f924cb4175e1c4400 to your computer and use it in GitHub Desktop.
Save kapcod/3d511197b73f8d9f924cb4175e1c4400 to your computer and use it in GitHub Desktop.
Join 2 unsorted CSV files using key column, requires loading 1 of the 2 CSV in memory. Allows filtering.
require 'csv'
# This will create join of 2 CSVs based on key-column without deduplication
# The output CSV by order of csv1
# more memory-efficient is to put csv1 longest table and csv2 shortest with less key column duplication
def join_csv(csv1_path, key1, csv2_path, key2, csv3_path, key_convertor: nil, ignore_cols: [], filter_csv1: nil, filter_csv2: nil)
start_ts = Time.now.to_i
puts "Reading #{csv2_path}, size: #{File.size(csv2_path)}..."
csv2 = CSV.read(csv2_path, headers: true)
index = Hash.new { |h, k| h[k] = [] }
csv2.each_with_index do |r, i|
next if filter_csv2 && !filter_csv2.call(r)
value = r[key2]
value = key_convertor.call(value) if value && key_convertor
index[value] << r if value
print "\rIndexing... #{i+1}/#{csv2.size}, unique keys: #{index.size}"
end
puts
CSV.open(csv1_path, 'r', headers: true) do |csv1|
csv1.first # need to read headers
headers1 = [key1] + (csv1.headers - [key1]) - ignore_cols
headers2 = csv2.headers - [key2] - ignore_cols
csv1.rewind
CSV.open(csv3_path, 'w', headers: headers1 + headers2, write_headers: true) do |out|
total = 0
csv1.each_with_index do |r1, i|
next if filter_csv1 && !filter_csv1.call(r1)
value = r1[key1]
value = key_convertor.call(value) if value && key_convertor
next unless value
index[value].each do |r2|
h = {}
headers1.each { |c| h[c] = r1[c] }
(headers2 - [key2]).each { |c| h[c] = r2[c] }
out << h
total += 1
end
print "\rJoining... #{i+1}, output rows: #{total}"
end
end
end
puts "\nDone in #{Time.now.to_i - start_ts}s"
end
join_csv(*ARGV) if $PROGRAM_NAME == __FILE__
ruby join_csv.rb test_file1.csv B test_file2.csv E test_output.csv
A B C
1 1 1
2 2 2
3 3 3
4 1 4
D E F
a b c
b 1 b
c 2 c
d 4 d
B A C D F
1 1 1 b b
2 2 2 c c
1 4 4 b b
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment