kapcod/join_csv.rb

## join_csv.rb
require 'csv'

# This will create join of 2 CSVs based on key-column without deduplication
# The output CSV by order of csv1
# more memory-efficient is to put csv1 longest table and csv2 shortest with less key column duplication
def join_csv(csv1_path, key1, csv2_path, key2, csv3_path, key_convertor: nil, ignore_cols: [], filter_csv1: nil, filter_csv2: nil)
  start_ts = Time.now.to_i
  puts "Reading #{csv2_path}, size: #{File.size(csv2_path)}..."
  csv2 = CSV.read(csv2_path, headers: true)

  index = Hash.new { |h, k| h[k] = [] }
  csv2.each_with_index do |r, i|
    next if filter_csv2 && !filter_csv2.call(r)

    value = r[key2]
    value = key_convertor.call(value) if value && key_convertor
    index[value] << r if value
    print "\rIndexing... #{i+1}/#{csv2.size}, unique keys: #{index.size}"
  end
  puts

  CSV.open(csv1_path, 'r', headers: true) do |csv1|
    csv1.first # need to read headers
    headers1 = [key1] + (csv1.headers - [key1]) - ignore_cols
    headers2 = csv2.headers - [key2] - ignore_cols
    csv1.rewind

    CSV.open(csv3_path, 'w', headers: headers1 + headers2, write_headers: true) do |out|
      total = 0
      csv1.each_with_index do |r1, i|
        next if filter_csv1 && !filter_csv1.call(r1)

        value = r1[key1]
        value = key_convertor.call(value) if value && key_convertor
        next unless value

        index[value].each do |r2|
          h = {}
          headers1.each { |c| h[c] = r1[c] }
          (headers2 - [key2]).each { |c| h[c] = r2[c] }
          out << h
          total += 1
        end

        print "\rJoining... #{i+1}, output rows: #{total}"
      end
    end
  end

  puts "\nDone in #{Time.now.to_i - start_ts}s"
end

join_csv(*ARGV) if $PROGRAM_NAME == __FILE__

## test_command.sh
ruby join_csv.rb test_file1.csv B test_file2.csv E test_output.csv

## test_file1.csv

          
            A
            B
            C

            
              1
              1
              1

            
              2
              2
              2

            
              3
              3
              3

            
              4
              1
              4

## test_file2.csv

          
            D
            E
            F

            
              a
              b
              c

            
              b
              1
              b

            
              c
              2
              c

            
              d
              4
              d

## test_output.csv

          
            B
            A
            C
            D
            F

            
              1
              1
              1
              b
              b

            
              2
              2
              2
              c
              c

            
              1
              4
              4
              b
              b
	require 'csv'

	# This will create join of 2 CSVs based on key-column without deduplication
	# The output CSV by order of csv1
	# more memory-efficient is to put csv1 longest table and csv2 shortest with less key column duplication
	def join_csv(csv1_path, key1, csv2_path, key2, csv3_path, key_convertor: nil, ignore_cols: [], filter_csv1: nil, filter_csv2: nil)
	start_ts = Time.now.to_i
	puts "Reading #{csv2_path}, size: #{File.size(csv2_path)}..."
	csv2 = CSV.read(csv2_path, headers: true)

	index = Hash.new { \|h, k\| h[k] = [] }
	csv2.each_with_index do \|r, i\|
	next if filter_csv2 && !filter_csv2.call(r)

	value = r[key2]
	value = key_convertor.call(value) if value && key_convertor
	index[value] << r if value
	print "\rIndexing... #{i+1}/#{csv2.size}, unique keys: #{index.size}"
	end
	puts

	CSV.open(csv1_path, 'r', headers: true) do \|csv1\|
	csv1.first # need to read headers
	headers1 = [key1] + (csv1.headers - [key1]) - ignore_cols
	headers2 = csv2.headers - [key2] - ignore_cols
	csv1.rewind

	CSV.open(csv3_path, 'w', headers: headers1 + headers2, write_headers: true) do \|out\|
	total = 0
	csv1.each_with_index do \|r1, i\|
	next if filter_csv1 && !filter_csv1.call(r1)

	value = r1[key1]
	value = key_convertor.call(value) if value && key_convertor
	next unless value

	index[value].each do \|r2\|
	h = {}
	headers1.each { \|c\| h[c] = r1[c] }
	(headers2 - [key2]).each { \|c\| h[c] = r2[c] }
	out << h
	total += 1
	end

	print "\rJoining... #{i+1}, output rows: #{total}"
	end
	end
	end

	puts "\nDone in #{Time.now.to_i - start_ts}s"
	end

	join_csv(*ARGV) if $PROGRAM_NAME == __FILE__