Skip to content

Instantly share code, notes, and snippets.

@thegedge
Last active May 14, 2020 13:42
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thegedge/51f2da7a2ce8e7067e949f3c4e806d92 to your computer and use it in GitHub Desktop.
Save thegedge/51f2da7a2ce8e7067e949f3c4e806d92 to your computer and use it in GitHub Desktop.
Attempt to find cohesive sets of methods in a Ruby file, based on how it's changed with other files in a repo
#!/usr/bin/ruby --disable-gems
#
# # Prerequisites
#
# # Example usage
#
# find_cohesive_changes <file>
#
require "open3"
require "set"
require "stringio"
require "time"
class GitClient
def initialize(git_dir)
@git_dir = git_dir
end
def call(*args)
stdout, stderr, status = Open3.capture3("git", "-C", @git_dir, *args)
return stdout if status.success?
STDERR.puts("Failed to run git command: #{args.join(" ")}")
STDERR.puts(stderr)
exit status.to_i
end
end
module Extractors
FileGrouping = Struct.new(:key, :files, :commits, :weight, keyword_init: true)
class Git
CommitData = Struct.new(
"CommitData",
:author_date, :sha, :subject, :first_parent_sha, :second_parent_sha, :changed_files
)
private_constant :CommitData
COMMIT_DATA_SEPARATOR = "--->"
private_constant :COMMIT_DATA_SEPARATOR
SUBJECT_SEPARATOR = ">>>"
private_constant :SUBJECT_SEPARATOR
MERGE_COMMIT_PR_REGEX = /Merge pull request #(\d+) from/
private_constant :MERGE_COMMIT_PR_REGEX
def initialize(git_client:, since:)
@git_client = git_client
@since = since
end
def extract
now = Time.now
oldest = now - master_commits.entries.last.last.author_date
youngest = now - master_commits.entries.first.last.author_date
range = (oldest - youngest).to_f
master_commits.map do |_sha, master_commit|
weight = 1 - ((now - master_commit.author_date) / range)
branch_commits = branch_commits(master_commit)
FileGrouping.new(
key: pr_number(master_commit),
commits: branch_commits,
files: branch_commits.flat_map(&:changed_files).uniq.sort,
weight: weight,
)
end
end
private
attr_reader :since
def pr_number(commit)
match = commit.subject.match(MERGE_COMMIT_PR_REGEX)
return nil unless match
match[1]
end
def branch_commits(merge_commit)
branch_commits = [merge_commit]
# Traverse backwards through the branch until we return to master
commit = commits[merge_commit.second_parent_sha]
while commit && !master_commits.key?(commit.sha)
branch_commits << commit
commit = commits[commit.first_parent_sha]
end
branch_commits
end
def master_commits
@master_commits ||= begin
master_commits = {}
# Traverse the first parent of every merge commit (similar to the --first-parent switch
# in some git commands). This should eventually end at the root commit.
_, commit = commits.first
while commit
master_commits[commit.sha] = commit
# First parent of a merge commit is on the same branch
commit = commits[commit.first_parent_sha]
end
master_commits
end
end
def commits
@commits ||= begin
format_arg = "--format=format:#{COMMIT_DATA_SEPARATOR}%aI %H %P #{SUBJECT_SEPARATOR} %s"
since_arg = "--since=\"#{since}\""
branch_arg = "origin/master"
commit_data = @git_client.call("log", "--name-status", format_arg, since_arg, branch_arg)
commit_data = commit_data.split(COMMIT_DATA_SEPARATOR)
# Keep track of files that are deleted. There's no need to keep them for consideration, so
# we'll drop them if we encounter them again. Also, since we're going in chronologically
# descending order, we should capture everything in a single pass.
deleted_files = Set.new
# First entry will always be empty, because there's a separator at the beginning of the file
commit_data.drop(1).each_with_object({}) do |data, commits|
lines = data.lines.map(&:chomp)
header, subject = lines.first.split(SUBJECT_SEPARATOR)
changed_files = Array(lines[1..-1]).map do |line|
next nil if line.empty?
status, file1, file2 = line.split
file = case status
when 'D'
deleted_files << file1
nil
when /R\d+/
deleted_files << file1
file2
else
file1
end
if deleted_files.include?(file)
nil
else
file
end
end
author_date, sha, first_parent_sha, second_parent_sha = header.split
commits[sha] = CommitData.new(
Time.iso8601(author_date),
sha,
subject,
first_parent_sha,
second_parent_sha,
changed_files.compact
)
end
end
end
end
class RubyMethods
Method = Struct.new(:name, :range, keyword_init: true)
def initialize
#
end
def methods(contents)
method_definitions(parse(contents))
end
private
def method_definitions(node, methods = [])
return unless node
return unless node.is_a?(::RubyVM::AbstractSyntaxTree::Node)
name = case node.type.downcase
# def foo; ...; end
when :def, :defn
node.children[0].to_s
# def foo.bar; ...; end
when :defs
node.children[1].to_s
else
nil
end
if name
methods << Method.new(
name: name,
range: (node.first_lineno .. node.last_lineno),
)
end
node.children.each do |child|
method_definitions(child, methods)
end
methods
end
def parse(content)
$stderr = StringIO.new
RubyVM::AbstractSyntaxTree.parse(content)
rescue SyntaxError => e
STDERR.puts("#{file_path}: could not parse (#{e})")
nil
ensure
$stderr = STDERR
end
end
class GitDiff
Diff = Struct.new(:name_before, :name_after, :chunks, keyword_init: true)
Chunk = Struct.new(:before, :after, keyword_init: true)
def initialize(git_client)
@git_client = git_client
end
def diffs_for(sha)
show_output = @git_client.call("show", "--format=oneline", "--unified=0", sha).lines
diffs(show_output).map do |lines|
# If file is simply deleted or renamed, ignore
next if lines.length < 4
# We skip the first two lines, that look like this
#
# diff --git a/app/models/model.rb b/app/models/model.rb
# index 123456789abc..abc123def456 100644
#
# Then get the names from the next two lines, that look like this:
#
# --- a/app/models/model.rb
# +++ b/app/models/model.rb
#
name_before = lines[2].split[1][2..]
name_after = lines[3].split[1][2..]
# There can be many of these under the above header
#
# @@ -2880,2 +2876 @@ functional context
# - some text that was here before
# - more previous text
# + some new stuff getting added
#
chunks = lines.slice_when { |_, next_line| next_line.start_with?("@@") }
chunks = chunks.map do |chunk_lines|
_, removals, additions, * = chunk_lines[0].split
removals = removals[1..].split(",").map(&:to_i)
additions = additions[1..].split(",").map(&:to_i)
# If no comma, implicit chunk length of 1
removals << 1 if removals.length == 1
additions << 1 if additions.length == 1
Chunk.new(
before: range(*removals),
after: range(*additions),
)
end
Diff.new(
name_before: name_before,
name_after: name_after,
chunks: chunks,
)
end
end
private
def range(start, length)
if length == 0
-5 .. -5 # arbitrary range that won't make sense in the context of file line numbers
else
start ... (start + length)
end
end
def diffs(output)
output
.drop(1)
.map(&:chomp)
.slice_when { |_, next_line| next_line.start_with?("diff") }
end
end
end
def ranges_overlap?(a, b)
a_end = a.exclude_end? ? a.end - 1 : a.end
b_end = b.exclude_end? ? b.end - 1 : b.end
b.begin <= a_end && a.begin <= b_end
end
# TODO parameterize these constants
INCLUDE_GLOBS = [
"**/*.rb",
]
EXCLUDE_GLOBS = [
"config/**/*",
"db/**/*",
"**/test/**/*",
"**/spec/**/*",
#"**/graph{ql,_api}/**/*",
]
SINCE = "2 months ago"
FILE_COUNT_THRESHOLD = 15
file = ARGV[0]
git_client = GitClient.new(Dir.pwd)
ruby_methods = Extractors::RubyMethods.new
git_differ = Extractors::GitDiff.new(git_client)
current_methods = ruby_methods.methods(File.read(file)).map(&:name).to_set
pull_requests = Extractors::Git.new(git_client: git_client, since: SINCE).extract
pull_requests.each do |pr|
next if pr.files.length > FILE_COUNT_THRESHOLD
affected_methods = pr.commits.flat_map do |commit|
next [] unless commit.changed_files.include?(file)
diffs = git_differ.diffs_for(commit.sha)
file_diff = diffs.find { |diff| diff.name_after == file }
file_contents = git_client.call("show", "#{commit.sha}:#{file}")
methods = ruby_methods.methods(file_contents)
# So what we're going to do is go over the chunks and match the lines getting added
# to the detected methods. This will roughly let us know what's getting changed.
methods.select do |method|
next unless current_methods.include?(method.name)
file_diff.chunks.any? do |chunk|
ranges_overlap?(method.range, chunk.after)
end
end
end
components = pr.files.map do |file|
next unless file.start_with?("components")
file.split("/", 3)[1]
end
unless affected_methods.empty?
puts pr.key
puts " Methods:"
puts affected_methods.map(&:name).uniq.sort.map { |x| " -#{x}" }
puts ""
puts " Components:"
puts components.compact.uniq.sort.map { |x| " -#{x}" }
puts ""
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment