Skip to content

@rklemme /find-duplicate-id-instance.rb
Created

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Solution suggestion for "Help with ideas for finding dups on very large file"
#!/usr/bin/ruby -w
# Value object holding the ID / INSTANCE pair extracted from one input line.
# Being a Struct, two entries with equal fields compare equal and hash alike,
# which is what lets them serve as Hash keys when counting occurrences.
Entry = Struct.new(:id, :instance) do
  # Builds an Entry from a line containing ID='...' followed by INSTANCE='...'.
  # The text between each pair of single quotes becomes the respective field.
  #
  # Raises RuntimeError ("Cannot parse: <inspected line>") when the line does
  # not contain that pattern.
  def self.parse(line)
    found = /ID=\s*'([^']*)'\s+INSTANCE=\s*'([^']*)'/.match(line)
    raise "Cannot parse: %p" % line unless found

    new(*found.captures)
  end
end
# Iterates over +file+ line by line and yields [line, entry] for every line
# that Entry.parse accepts. Lines that cannot be parsed are skipped on purpose.
#
# An explicit begin/rescue is used instead of a trailing `rescue nil`: on
# Ruby 2.4+ a rescue modifier after an op-assignment (`h[k] += 1 rescue nil`)
# guards only the right-hand operand, so a parse error raised inside the index
# expression would escape and abort the whole run.
def each_parsed_line(file)
  File.foreach(file) do |line|
    entry = begin
      Entry.parse(line)
    rescue StandardError
      nil # ignore unparsable lines
    end
    yield line, entry if entry
  end
end

# Phase 1: count occurrences of all pairs.
# Returns a Hash mapping each parsed entry to the number of lines, across all
# +files+, that produced it. Missing keys read as 0.
def count_entries(files)
  counts = Hash.new(0)
  files.each do |file|
    each_parsed_line(file) { |_line, entry| counts[entry] += 1 }
  end
  counts
end

# Phase 2: print only dupes.
# Re-reads +files+ and writes to +out+ every original line whose entry was
# counted more than once in +counts+.
def print_duplicates(files, counts, out = $stdout)
  files.each do |file|
    each_parsed_line(file) do |line, entry|
      # fetch with a fallback so the lookup does not rely on the hash default
      out.puts line if counts.fetch(entry, 0) > 1
    end
  end
end

entries = count_entries(ARGV)
# save some memory, not necessarily needed
entries.delete_if { |_k, v| v < 2 }
print_duplicates(ARGV, entries)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.