Skip to content

Instantly share code, notes, and snippets.

@danhealy
Created October 24, 2013 08:29
Show Gist options
  • Save danhealy/7133303 to your computer and use it in GitHub Desktop.
Save danhealy/7133303 to your computer and use it in GitHub Desktop.
Count duplicate lines in a git repo
#!/usr/bin/env ruby
# Numeric helpers added to Array for the statistics below.
class Array
  # Adds every element to a Float accumulator, so the result is always a
  # Float (0.0 for an empty array).
  def sum
    total = 0.0
    each { |value| total += value }
    total
  end

  # Arithmetic mean of the elements: the Float total divided by the
  # element count. An empty array yields NaN (0.0 / 0).
  def mean
    sum / length
  end
end
# Added lines whose stripped text equals one of these entries are not counted
# as duplicates: they are structural or boilerplate lines that repeat
# naturally. Frozen because the list is a read-only lookup table.
ALLOWED_DUPES = [
  # Bare delimiters
  "(",
  "{",
  "[",
  "]",
  "}",
  ")",
  "],",
  "},",
  "),",
  # Keywords and one-word statements
  "end",
  "else",
  "begin",
  "next",
  "break",
  "private",
  "super",
  "return",
  "retry",
  "timestamps",
  "return nil",
  "return false",
  "return true",
  # Common framework / project boilerplate
  "respond_to do |format|",
  "structure do",
  "t.datetime :created_at",
  "t.datetime :updated_at",
  "has_paper_trail",
  "class << self",
  "#!/usr/bin/env ruby",
  "def self.up",
  "def self.down",
  "t.timestamps",
  "agent = Mechanize.new",
  # Rescue clauses
  "rescue",
  "rescue Exception => e",
  "rescue Exception => ex",
  "rescue RuntimeError => e",
  "rescue => e",
  "rescue => ex"
].freeze
# Path fragments of files whose added lines are excluded from duplicate
# counting. A file is ignored when its path contains any of these substrings.
# Frozen because the list is a read-only lookup table.
IGNORED_FILES = [
  "schema.rb",
  "db/migrate/",
  "vendor/",
  "spec/",
  "test/"
].freeze
# Per-author statistics, keyed by author name and created on first lookup.
# Each entry is a hash of counters defaulting to 0, plus two nested tallies:
#   :lines_committed - added line text => number of times it was added
#   :worst_commits   - commit id       => duplicate count for that commit
# The block must evaluate to the new entry itself: Hash#[] returns the block's
# value on a miss, so returning anything else would hand the caller the wrong
# object on the very first lookup.
authors = Hash.new do |by_author, name|
  stats = Hash.new(0)
  stats[:lines_committed] = Hash.new(0)
  stats[:worst_commits] = Hash.new(0)
  by_author[name] = stats
end
# Stream `git log -p` for Ruby files and tally, per author: commits, added
# lines, removed lines, how often each added line was added, and how many
# repeated lines each commit contains. Results accumulate in `authors`;
# `line_count` ends up as the number of log lines examined.
require 'shellwords'

line_count = 0
# Extra command-line arguments are passed through to `git log`. They are
# re-escaped so an argument containing spaces or shell metacharacters reaches
# git as the single argument it was typed as.
c = "git log --unified=0 -p #{Shellwords.join(ARGV)} -- \"*.rb\""
print "Processing: '#{c.strip}'"
IO.popen(c) do |log|
  cur_author = nil
  cur_commit = nil
  cur_file_ignored = false
  cur_lines = Hash.new(0)

  # Closes out the commit currently being read: sums the occurrences of every
  # line added more than once in it and records that against its author.
  record_commit_dupes = lambda do
    if cur_author && cur_commit
      cur_dupes = cur_lines.values.select { |count| count > 1 }.sum.to_i
      authors[cur_author][:same_commit_dupes] += cur_dupes
      authors[cur_author][:worst_commits][cur_commit] = cur_dupes
    end
  end

  while (line = log.gets)
    # Lines containing non-ASCII bytes are skipped entirely (best effort).
    next unless line.ascii_only?

    line_count += 1
    print "." if (line_count % 1000) == 0
    if (line_count % 100_000) == 0
      print "\n"
      print "Line #{line_count}"
    end

    case line
    when /^diff /
      header = line.match(/^diff --git a\/(.*) b\/(.*)/)
      # Headers not in the `--git a/... b/...` form are matched against the
      # ignore list as-is instead of raising on a failed match.
      cur_file = header ? header[2] : line.strip
      cur_file_ignored = IGNORED_FILES.any? { |fragment| cur_file.include?(fragment) }
    when /^commit (.*)$/
      sha = Regexp.last_match(1)
      record_commit_dupes.call
      cur_lines = Hash.new(0)
      cur_commit = sha
      # The author is unknown until this commit's own "Author:" line is read;
      # clearing it keeps this commit from being credited to the previous one.
      cur_author = nil
    when /^Author: (.*)$/
      # Keep the full display name: everything before the trailing <email>.
      cur_author = Regexp.last_match(1).sub(/\s*<[^<>]*>\s*\z/, "").strip
      # Look the author up once on its own so the stats entry exists before
      # any counter on it is incremented.
      authors[cur_author]
      # Counted here, not on the "commit" line, so the commit goes to the
      # author named in its own header.
      authors[cur_author][:commits] += 1
    when /^\+[^\+]/
      code = line[1..-1].strip
      # Lines with no known author (Author line skipped as non-ASCII) are dropped.
      if cur_author && !code.empty?
        authors[cur_author][:lines_committed_count] += 1
        countable = !ALLOWED_DUPES.include?(code) &&
                    !code.start_with?("#") &&
                    !cur_file_ignored
        if countable
          authors[cur_author][:lines_committed][code] += 1
          cur_lines[code] += 1
        end
      end
    when /^\-[^\-]/
      code = line[1..-1].strip
      if cur_author && !code.empty?
        authors[cur_author][:lines_deleted] += 1
      end
    end
  end

  # The last commit in the log has no following "commit" line to trigger the
  # tally, so record it explicitly.
  record_commit_dupes.call
end
# Print one report section per author from the collected statistics.
puts
puts "Processed #{line_count} lines of logs. Calculating final stats..."
authors.each do |author, stats|
  # Added lines seen more than once, and the total of their occurrence counts.
  repeated = stats[:lines_committed].select { |_text, times| times > 1 }
  total_dupes = repeated.values.sum

  # Both rankings are ordered by count, highest first.
  ranked_lines = repeated.to_a.sort { |x, y| y[1] <=> x[1] }
  ranked_commits = stats[:worst_commits]
                   .select { |_sha, dupes| dupes > 1 }
                   .to_a
                   .sort { |x, y| y[1] <=> x[1] }

  puts "-"*80
  puts "Stats for #{author}"
  puts "Commits : #{stats[:commits]}"
  puts "+Lines added : #{stats[:lines_committed_count]}"
  puts "-Lines removed : #{stats[:lines_deleted]}"
  puts "Total dupes : #{total_dupes}"
  puts "Duped lines in single commits : #{stats[:same_commit_dupes]}"
  puts "Dupes per commit (avg) : #{total_dupes / stats[:commits].to_f}"

  ranked_lines.take(5).each_with_index do |(text, times), rank|
    puts "Top duped lines -- \##{rank + 1} : #{times} times == '#{text}'"
  end
  ranked_commits.take(5).each_with_index do |(sha, dupes), rank|
    puts "Top commits with dupes -- \##{rank + 1} : #{dupes} dupes == #{sha}"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment