Created
October 24, 2013 08:29
-
-
Save danhealy/7133303 to your computer and use it in GitHub Desktop.
Count duplicate lines in a git repo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Numeric helpers for the statistics computed by this script.
#
# NOTE: this reopens the core Array class. Newer Rubies already provide
# Array#sum; this definition replaces it, so it accepts the same optional
# start value and block to avoid breaking other callers of Array#sum.
class Array
  # Adds up the elements, starting from +init+.
  #
  # With no argument the accumulator starts at 0.0, so the result is a
  # Float (0.0 for an empty array). When a block is given, each element is
  # passed through the block before being added.
  def sum(init = 0.0)
    inject(init) { |result, el| result + (block_given? ? yield(el) : el) }
  end

  # Arithmetic mean of the elements (Float).
  # An empty array gives NaN, because it evaluates 0.0 / 0.
  def mean
    sum / size
  end
end
# Added lines (after stripping whitespace) that are never counted as
# duplicates: bare delimiters, keywords and boilerplate statements.
# Matching is by exact string equality. Frozen so the list cannot be
# modified by accident at runtime.
ALLOWED_DUPES = [
  "(",
  "{",
  "[",
  "]",
  "}",
  ")",
  "],",
  "},",
  "),",
  "end",
  "else",
  "begin",
  "next",
  "break",
  "private",
  "super",
  "return",
  "retry",
  "timestamps",
  "return nil",
  "return false",
  "return true",
  "respond_to do |format|",
  "structure do",
  "t.datetime :created_at",
  "t.datetime :updated_at",
  "has_paper_trail",
  "class << self",
  "#!/usr/bin/env ruby",
  "def self.up",
  "def self.down",
  "t.timestamps",
  "agent = Mechanize.new",
  "rescue",
  "rescue Exception => e",
  "rescue Exception => ex",
  "rescue RuntimeError => e",
  "rescue => e",
  "rescue => ex"
].freeze
# Path fragments of files whose added lines are excluded from duplicate
# counting. A file is ignored when its path contains any of these
# substrings. Frozen so the list cannot be modified by accident at runtime.
IGNORED_FILES = [
  "schema.rb",
  "db/migrate/",
  "vendor/",
  "spec/",
  "test/"
].freeze
# Per-author statistics, keyed by author name and created on first lookup.
#
# Each entry is a hash of counters defaulting to 0, plus two nested
# counter hashes:
#   :lines_committed - added line text => number of times it was added
#   :worst_commits   - commit id => duplicated-line count for that commit
authors = Hash.new do |h, k|
  stats = Hash.new(0)
  stats[:lines_committed] = Hash.new(0)
  stats[:worst_commits] = Hash.new(0)
  # The value of this block is what the very first lookup of a key returns,
  # so it has to be the stats hash itself rather than one of the nested
  # hashes; otherwise a first-access `authors[name][:x] += 1` would be
  # written into the wrong hash.
  h[k] = stats
end
# Reads the output of `git log -p` line by line and accumulates, per author,
# commit counts, added/removed line counts and how often each added line of
# Ruby code was repeated (overall and within a single commit).
line_count = 0
# Human-readable form of the command; used only for the progress message.
c = "git log --unified=0 -p #{ARGV.join(' ')} -- \"*.rb\""
print "Processing: '#{c.strip}'"
# git is started from an argument array instead of a shell string, so extra
# command-line arguments containing spaces or shell metacharacters are
# handed to git exactly as they were given.
git_command = ["git", "log", "--unified=0", "-p", *ARGV, "--", "*.rb"]
IO.popen(git_command) do |log|
  cur_author = nil
  cur_commit = nil
  cur_file_ignored = false
  cur_lines = Hash.new(0)

  # Books the commit that has just been read against its author and resets
  # the per-commit line tally. It is called when the next "commit" header
  # shows up and once more after the last line, so that neither the first
  # nor the final commit is miscounted.
  finish_commit = lambda do
    if cur_author && cur_commit
      cur_dupes = cur_lines.values.select { |n| n > 1 }.inject(0) { |total, n| total + n }
      stats = authors[cur_author]
      stats[:commits] += 1
      stats[:same_commit_dupes] += cur_dupes
      stats[:worst_commits][cur_commit] = cur_dupes
    end
    cur_lines = Hash.new(0)
  end

  log.each_line do |line|
    # Lines whose bytes are not valid in the stream's encoding cannot be
    # matched against the patterns below, so they are skipped (and not
    # counted as processed).
    next unless line.valid_encoding?

    line_count += 1
    print "." if (line_count % 1000) == 0
    if (line_count % 100_000) == 0
      print "\n"
      print "Line #{line_count}"
    end

    case line
    when /^diff/
      m = line.match(/^diff --git a\/(.*) b\/(.*)/)
      # A diff header in an unexpected format is treated as a file that is
      # not ignored instead of raising on a nil match.
      cur_file = m && m[2]
      cur_file_ignored = !cur_file.nil? && IGNORED_FILES.any? { |fragment| cur_file.include?(fragment) }
    when /^commit (.*)$/
      commit_id = Regexp.last_match(1)
      finish_commit.call
      cur_commit = commit_id
      # cur_author is deliberately left alone here: it is replaced when the
      # "Author:" header of this commit is read.
    when /^Author: (.*)$/
      raw = Regexp.last_match(1).strip
      # Drop a trailing "<email>" part; everything before it is the name,
      # including punctuation such as "J. Smith" or "O'Brien".
      name = raw.sub(/\s*<[^<>]*>\z/, "").strip
      cur_author = name.empty? ? raw : name
      # Make sure the author has an entry even if none of their lines end
      # up being counted.
      authors[cur_author]
    when /^\+[^\+]/
      # Added line. The character class keeps "+++ b/file" headers out.
      code = line[1..-1].strip
      unless code.empty?
        authors[cur_author][:lines_committed_count] += 1
        countable = !cur_file_ignored && !code.start_with?("#") && !ALLOWED_DUPES.include?(code)
        if countable
          authors[cur_author][:lines_committed][code] += 1
          cur_lines[code] += 1
        end
      end
    when /^\-[^\-]/
      # Removed line. The character class keeps "--- a/file" headers out.
      code = line[1..-1].strip
      authors[cur_author][:lines_deleted] += 1 unless code.empty?
    end
  end

  # The last commit in the log has no following "commit" header.
  finish_commit.call
end
# Prints the per-author summary: totals, average duplicated lines per commit,
# the five most repeated lines and the five commits with the most
# duplicated lines.
puts
puts "Processed #{line_count} lines of logs. Calculating final stats..."
authors.each do |author, stats|
  # Keeps only entries counted more than once, highest count first, as an
  # array of [key, count] pairs.
  repeated_desc = lambda do |counts|
    counts.select { |_key, n| n > 1 }.sort { |a, b| b[1] <=> a[1] }
  end

  sorted_dupes = repeated_desc.call(stats[:lines_committed])
  sorted_commits = repeated_desc.call(stats[:worst_commits])
  # Summed with inject so the total is an Integer no matter which Array#sum
  # implementation is in effect.
  total_dupes = sorted_dupes.inject(0) { |total, (_line, n)| total + n }
  commits = stats[:commits]
  # An author with no recorded commits would otherwise divide by zero.
  dupes_per_commit = commits > 0 ? total_dupes / commits.to_f : 0.0

  puts "-"*80
  puts "Stats for #{author}"
  puts "Commits : #{commits}"
  puts "+Lines added : #{stats[:lines_committed_count]}"
  puts "-Lines removed : #{stats[:lines_deleted]}"
  puts "Total dupes : #{total_dupes}"
  puts "Duped lines in single commits : #{stats[:same_commit_dupes]}"
  puts "Dupes per commit (avg) : #{dupes_per_commit}"
  sorted_dupes.take(5).each_with_index do |(text, count), i|
    puts "Top duped lines -- \##{i + 1} : #{count} times == '#{text}'"
  end
  sorted_commits.take(5).each_with_index do |(commit, count), i|
    puts "Top commits with dupes -- \##{i + 1} : #{count} dupes == #{commit}"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment