Created
August 11, 2009 23:04
-
-
Save zach-klippenstein/166171 to your computer and use it in GitHub Desktop.
Ugly-as-heck script for diffing two CSV files that differ in both number of rows and row content.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
# | |
# Ugly-as-heck script for comparing two CSV files whose rows have changed, and
# where rows were also added to or removed from the second file. Traditional diff
# tools can't make the connection between these two types of changes, so they show
# every line as having changed.
# | |
# Uses git and gitk. | |
require 'fileutils' | |
include FileUtils | |
require 'pathname' | |
# Zero-based indexes of the columns that are kept when building the
# intermediate (simplified) versions of the files for diffing.
interm_cols = [0]

# Name given to the working copy of the CSV inside the temp directory.
# Frozen so the shared constant cannot be modified in place by accident.
FILE = 'file'.freeze
# Runs an external command and discards whatever it writes to standard output.
#
# The arguments are forwarded to Kernel#system unchanged, so a single string
# may be run through the shell while several arguments are executed directly.
# Standard error is left attached so that failures remain visible.
#
# @param args [Array<String>] command and arguments, as accepted by Kernel#system
# @return [Boolean, nil] true on a zero exit status, false on a non-zero one,
#   nil if the command could not be executed
def quiet_sh(*args)
  # Redirect at the process level: appending '>/dev/null' to the argument list
  # only works when a shell interprets the command line.
  system(*args, out: File::NULL)
end
# Thin wrappers around the git and gitk command-line tools. Every command is
# issued through quiet_sh and therefore acts on the current working directory.
module Git
  module_function

  # Creates a repository in the current directory.
  #
  # @return [Boolean, nil] result of quiet_sh
  def init
    quiet_sh('git', 'init')
  end

  # Stages the given paths, issuing one `git add` per path.
  #
  # @param files [Array<String>] paths to stage
  # @return [Array<String>] the paths that were passed in
  def add(*files)
    files.each { |file| quiet_sh('git', 'add', file) }
  end

  # Commits all changes to tracked files (`git commit -a`).
  #
  # @param msg [String] commit message
  # @return [Boolean, nil] result of quiet_sh
  def commit(msg = '')
    quiet_sh('git', 'commit', '-a', '-m', msg)
  end

  # Opens gitk with the given commit selected, in a forked child process, and
  # returns immediately in the caller. When gitk exits, the optional block is
  # run inside the child (e.g. for cleanup).
  #
  # @param commit [String] revision to select in gitk
  # @yield invoked in the child process after gitk has finished
  # @return [nil]
  def show(commit = 'HEAD')
    # The block form of fork makes the child exit once it is done, instead of
    # returning from this method and carrying on with the caller's code.
    viewer = fork do
      # Passing the arguments separately keeps the revision (e.g. "HEAD^")
      # away from shell interpretation.
      quiet_sh('gitk', "--select-commit=#{commit}")
      yield if block_given?
    end
    # Reap the child in the background so it does not linger as a zombie.
    Process.detach(viewer)
    nil
  end

  # Tags the current commit. Spaces in the name are replaced by underscores.
  #
  # @param msg [String] desired tag name; the argument itself is not modified
  # @return [Boolean, nil] result of quiet_sh
  def tag(msg)
    quiet_sh('git', 'tag', msg.tr(' ', '_'))
  end
end
# Copies infile to outfile line by line, keeping only the selected columns.
#
# Each input line is stripped of its line terminator and split on sep. The
# kept columns are written in their original order, joined by sep, one
# output line per input line. Rows that contain none of the selected columns
# produce an empty line, so line counts stay aligned with the input.
#
# @param infile [#each_line] source to read rows from
# @param outfile [#puts, #flush] destination for the reduced rows
# @param sep [String] column separator
# @param cols_to_filter [Array<Integer>] zero-based indexes of columns to keep
# @return [nil]
def filter_cols(infile, outfile, sep, cols_to_filter)
  infile.each_line do |line|
    # A limit of -1 keeps trailing empty cells so column indexes stay stable.
    values = line.chomp.split(sep, -1)
    kept = values.select.with_index { |_value, index| cols_to_filter.include?(index) }
    outfile.puts(kept.join(sep))
  end
  outfile.flush
  nil
end
# Sets up the scratch repository: creates temp_dir, copies file1 into it under
# the name FILE, changes the working directory to temp_dir, and records the
# copy as the first commit.
#
# @param temp_dir [Pathname] directory to create and turn into a git repository
# @param file1 [String, Pathname] path of the first CSV file; should be
#   absolute or valid relative to the current directory at call time
# @return [Boolean, nil] result of the initial commit command
def init(temp_dir, file1)
  working_copy = temp_dir.join(FILE)

  mkdir(temp_dir)
  # Interpolate rather than concatenate: String#+ does not accept a Pathname.
  puts "Copying file1 to #{working_copy}"
  cp(file1, working_copy)

  # Everything after this point (git commands, simplify) works on relative
  # paths inside temp_dir.
  cd(temp_dir)
  Git.init
  Git.add(FILE)
  Git.commit('initial commit: added file1')
end
# Rewrites FILE (in the current directory) so that it contains only the
# selected columns. The reduced rows are written to a sibling ".swap" file by
# filter_cols, and that file is then moved over FILE.
#
# @param sep [String] column separator
# @param cols_to_filter [Array<Integer>] column indexes handed to filter_cols
def simplify(sep, cols_to_filter)
  swap_name = FILE + '.swap'

  File.open(FILE, 'r') do |source|
    File.open(swap_name, 'w') do |sink|
      filter_cols(source, sink, sep, cols_to_filter)
    end
  end

  mv(swap_name, FILE)
end
# Builds a throw-away git repository whose history separates row-level changes
# from cell-level changes between two CSV files, then opens it in gitk.
#
# Commits created, in order:
#   1. file1 as-is (made by init)
#   2. file1 reduced to the columns in cols_to_filter
#   3. file2 reduced to the same columns (tagged: shows added/removed rows)
#   4. file2 in full (shows the remaining detail changes)
#
# The temp directory is removed after gitk exits, or immediately if any step
# raises.
#
# @param file1 [String] path of the original CSV file
# @param file2 [String] path of the modified CSV file
# @param sep [String] column separator
# @param cols_to_filter [Array<Integer>] indexes of the columns kept in the
#   intermediate commits
# @raise [StandardError] re-raises whatever a step raised, after cleaning up
def run(file1, file2, sep, cols_to_filter)
  # Resolve every path up front: init changes the working directory.
  temp_dir = Pathname.new("complex-diff-csv.tmp.#{$$}").expand_path
  file1 = Pathname.new(file1).expand_path
  file2 = Pathname.new(file2).expand_path
  working_copy = temp_dir.join(FILE)

  begin
    init(temp_dir, file1)

    puts 'Simplifying file1...'
    simplify(sep, cols_to_filter)
    Git.commit('intermediate stage 1: removed detail columns')

    puts 'Adding modifications introduced by file2...'
    cp(file2, working_copy)
    simplify(sep, cols_to_filter)
    Git.commit('intermediate stage 2: added/deleted rows from file2')
    Git.tag('This version shows the difference in rows')

    puts 'Detailing file2...'
    # Interpolation is required here: String#+ does not accept a Pathname.
    puts "Copying file2 to #{working_copy}"
    cp(file2, working_copy)
    Git.commit('final stage: added detail columns back from file2')

    # HEAD^ is the tagged stage-2 commit; the block cleans up once gitk closes.
    Git.show('HEAD^') { rm_rf(temp_dir) }
  rescue
    puts "Exception caught, deleting '#{temp_dir}'..."
    rm_rf(temp_dir)
    raise
  end
end
# Script entry point. Both CSV paths are required, so stop with a usage
# message instead of failing later on a nil path.
abort "Usage: #{$PROGRAM_NAME} FILE1 FILE2" if ARGV.length < 2

run(ARGV[0], ARGV[1], ',', interm_cols)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment