Last active
August 29, 2015 14:20
-
-
Save dmolesUC/ac525c61b25d200a2cfb to your computer and use it in GitHub Desktop.
Using fuzzy hashing to identify source code duplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Notes:
#
# - Depends on gem russdeep-1.2.1
# - cutoff of 65 is arbitrary
# - results may be unreliable for files < 4k
#   (i.e. most source files)
# - config files for different environments
#   are often crazy-similar
#
# Output for each pair:
#
#   [similarity score]
#     [file size] [file name]
#     [file size] [file name]
require 'ssdeep'
require 'set'

# Root folder that is searched recursively for Ruby source files.
folder = '.'

# Only pairs scoring strictly above this similarity value are reported.
# The threshold is arbitrary; tune it to taste.
cutoff = 65

# Fuzzy hash of every Ruby file under the folder, keyed by file path.
hashes = {}
Dir.glob("#{folder}/**/*.rb") do |file|
  hashes[file] = Ssdeep.from_file(file)
end

# Similar file pairs grouped by similarity score. Each pair is a
# two-element array of paths, sorted so the output order within a pair is
# stable regardless of glob order.
pairs_by_score = Hash.new { |h, score| h[score] = [] }

# combination(2) visits every unordered pair of files exactly once, so no
# set-based de-duplication (and no SortedSet, which left the standard
# library in Ruby 3.0) is needed.
# NOTE(review): this assumes Ssdeep.compare(a, b) == Ssdeep.compare(b, a);
# confirm against the ssdeep binding if scores look off.
hashes.to_a.combination(2) do |(file1, hash1), (file2, hash2)|
  score = Ssdeep.compare(hash1, hash2)
  pairs_by_score[score] << [file1, file2].sort if score > cutoff
end

# Report lowest scores first, so the most similar pairs end up at the
# bottom of the output.
pairs_by_score.sort_by { |score, _pairs| score }.each do |score, pairs|
  pairs.each do |pair|
    puts score
    pair.each { |file| puts "\t#{File.size(file)}\t#{file}" }
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment