Skip to content

Instantly share code, notes, and snippets.

@zach-klippenstein
Created August 11, 2009 23:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zach-klippenstein/166171 to your computer and use it in GitHub Desktop.
Save zach-klippenstein/166171 to your computer and use it in GitHub Desktop.
Ugly-as-heck script for diffing two CSV files that differ in both number of rows and row content.
#!/usr/bin/ruby
#
# Ugly-as-heck script for comparing two CSV files where the contents of existing
# rows have changed and rows were also added to or removed from the second file.
# Traditional diff tools can't tell these two kinds of change apart, so they show
# every line as having changed.
#
# Uses git and gitk.
require 'fileutils'
include FileUtils
require 'pathname'
# Zero-based indices of the columns kept while building the intermediate
# (row-level) diffs.
interm_cols = [0]

# File name given to the working copy of the CSV inside the temp directory.
FILE = 'file'
# Runs an external command and hands back whatever Kernel#system returns
# (true on a zero exit status, false on a non-zero one, nil if the command
# could not be executed).
#
# Despite the name, the command's output is NOT suppressed: the arguments
# are forwarded to Kernel#system untouched.
def quiet_sh(*args)
  Kernel.system(*args)
end
# Minimal wrappers around the git and gitk command-line tools. Every command
# is run through quiet_sh in the current working directory; exit statuses are
# returned as-is and never raised on.
module Git
  module_function

  # Runs `git init`.
  def init
    quiet_sh("git", "init")
  end

  # Runs `git add` once for each of the given files.
  def add(*files)
    files.each { |file| quiet_sh("git", "add", file) }
  end

  # Runs `git commit -a` with +msg+ as the commit message.
  def commit(msg = '')
    quiet_sh("git", "commit", "-a", "-m", msg)
  end

  # Forks; the child process runs gitk with +commit+ pre-selected and, once
  # gitk has returned, yields to the optional block. The parent process
  # returns nil immediately without waiting for the child.
  def show(commit = 'HEAD')
    return if fork # non-nil pid => we are the parent

    # argv form: +commit+ is passed as a single argument, not parsed by a shell
    quiet_sh("gitk", "--select-commit=#{commit}")
    yield if block_given?
  end

  # Creates a tag named after +msg+ with every space replaced by an
  # underscore. +msg+ itself is left unmodified, so frozen strings are fine.
  def tag(msg)
    quiet_sh("git", "tag", msg.tr(' ', '_'))
  end
end
# Copies the selected columns of every line read from +infile+ to +outfile+.
#
# infile         - readable IO-like object yielding +sep+-delimited lines.
# outfile        - writable IO-like object; one line is written per input line.
# sep            - column separator string, used for both splitting and joining.
# cols_to_filter - collection of zero-based column indices to KEEP.
#
# Kept columns are written in their original file order, joined by +sep+ and
# terminated by a single newline. Indices beyond the end of a row are ignored.
def filter_cols(infile, outfile, sep, cols_to_filter)
  infile.each_line do |line|
    # chomp first so the last column does not carry the line terminator;
    # limit -1 keeps trailing empty fields so column indices stay stable.
    values = line.chomp.split(sep, -1)
    kept = values.each_index
                 .select { |i| cols_to_filter.include?(i) }
                 .map { |i| values[i] }
    outfile.puts(kept.join(sep))
  end
  outfile.flush
end
# Sets up the scratch repository used for the diff.
#
# temp_dir - Pathname of the directory to create (it must not exist yet).
# file1    - path of the first CSV file.
#
# Creates +temp_dir+, copies +file1+ into it under the name FILE, changes the
# process's working directory to +temp_dir+, then initialises a git repository
# there and commits the copied file.
def init(temp_dir, file1)
  working_copy = temp_dir.join(FILE)

  mkdir(temp_dir)
  # Interpolate rather than concatenate: String#+ rejects a Pathname.
  puts "Copying file1 to #{working_copy}"
  cp(file1, working_copy)
  cd(temp_dir)

  Git.init
  Git.add(FILE)
  Git.commit('initial commit: added file1')
end
# Rewrites FILE (in the current working directory) so that it contains only
# the columns selected by +cols_to_filter+.
#
# The filtered rows are first written to a sibling "<FILE>.swap" file, which
# is then moved over FILE.
def simplify(sep, cols_to_filter)
  swap_name = "#{FILE}.swap"

  File.open(FILE, 'r') do |source|
    File.open(swap_name, 'w') do |sink|
      filter_cols(source, sink, sep, cols_to_filter)
    end
  end

  mv(swap_name, FILE)
end
# Builds a throw-away git repository whose history separates "rows added or
# removed" from "row contents changed" between two CSV files, then opens it
# in gitk.
#
# file1, file2   - paths of the CSV files to compare (old, new).
# sep            - column separator.
# cols_to_filter - zero-based indices of the columns kept in the intermediate
#                  commits.
#
# Commits, in order: file1 as-is; file1 reduced to the kept columns; file2
# reduced to the kept columns (tagged); file2 as-is. gitk is opened with
# HEAD^ selected, and the temp directory is removed after gitk returns.
#
# Any StandardError raised along the way removes the temp directory and is
# re-raised.
def run(file1, file2, sep, cols_to_filter)
  temp_dir = Pathname.new("complex-diff-csv.tmp.#{$$}").expand_path
  file1 = Pathname.new(file1).expand_path
  file2 = Pathname.new(file2).expand_path
  working_copy = temp_dir.join(FILE)

  begin
    init(temp_dir, file1)

    puts "Simplifying file1..."
    simplify(sep, cols_to_filter)
    Git.commit('intermediate stage 1: removed detail columns')

    puts "Adding modifications introduced by file2..."
    cp(file2, working_copy)
    simplify(sep, cols_to_filter)
    Git.commit('intermediate stage 2: added/deleted rows from file2')
    Git.tag('This version shows the difference in rows')

    puts "Detailing file2..."
    # Interpolate rather than concatenate: String#+ rejects a Pathname.
    puts "Copying file2 to #{working_copy}"
    cp(file2, working_copy)
    Git.commit('final stage: added detail columns back from file2')

    # Cleanup runs in the block, i.e. only once the viewer has returned.
    Git.show('HEAD^') { rm_rf(temp_dir) }
  rescue StandardError
    puts "Exception caught, deleting '#{temp_dir}'..."
    rm_rf(temp_dir)
    raise
  end
end
# Entry point: the two CSV paths are taken from the first two command-line
# arguments. Bail out with a usage message instead of an opaque TypeError
# from Pathname.new(nil) when either is missing.
abort("usage: #{$PROGRAM_NAME} FILE1 FILE2") if ARGV.length < 2
run(ARGV[0], ARGV[1], ',', interm_cols)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment