Last active
May 14, 2020 13:42
-
-
Save thegedge/51f2da7a2ce8e7067e949f3c4e806d92 to your computer and use it in GitHub Desktop.
Attempt to find cohesive sets of methods in a Ruby file, based on how it's changed with other files in a repo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby --disable-gems | |
# | |
# # Prerequisites | |
# | |
# # Example usage | |
# | |
# find_cohesive_changes <file> | |
# | |
require "open3" | |
require "set" | |
require "stringio" | |
require "time" | |
class GitClient | |
def initialize(git_dir) | |
@git_dir = git_dir | |
end | |
def call(*args) | |
stdout, stderr, status = Open3.capture3("git", "-C", @git_dir, *args) | |
return stdout if status.success? | |
STDERR.puts("Failed to run git command: #{args.join(" ")}") | |
STDERR.puts(stderr) | |
exit status.to_i | |
end | |
end | |
module Extractors | |
FileGrouping = Struct.new(:key, :files, :commits, :weight, keyword_init: true) | |
class Git | |
CommitData = Struct.new( | |
"CommitData", | |
:author_date, :sha, :subject, :first_parent_sha, :second_parent_sha, :changed_files | |
) | |
private_constant :CommitData | |
COMMIT_DATA_SEPARATOR = "--->" | |
private_constant :COMMIT_DATA_SEPARATOR | |
SUBJECT_SEPARATOR = ">>>" | |
private_constant :SUBJECT_SEPARATOR | |
MERGE_COMMIT_PR_REGEX = /Merge pull request #(\d+) from/ | |
private_constant :MERGE_COMMIT_PR_REGEX | |
def initialize(git_client:, since:) | |
@git_client = git_client | |
@since = since | |
end | |
def extract | |
now = Time.now | |
oldest = now - master_commits.entries.last.last.author_date | |
youngest = now - master_commits.entries.first.last.author_date | |
range = (oldest - youngest).to_f | |
master_commits.map do |_sha, master_commit| | |
weight = 1 - ((now - master_commit.author_date) / range) | |
branch_commits = branch_commits(master_commit) | |
FileGrouping.new( | |
key: pr_number(master_commit), | |
commits: branch_commits, | |
files: branch_commits.flat_map(&:changed_files).uniq.sort, | |
weight: weight, | |
) | |
end | |
end | |
private | |
attr_reader :since | |
def pr_number(commit) | |
match = commit.subject.match(MERGE_COMMIT_PR_REGEX) | |
return nil unless match | |
match[1] | |
end | |
def branch_commits(merge_commit) | |
branch_commits = [merge_commit] | |
# Traverse backwards through the branch until we return to master | |
commit = commits[merge_commit.second_parent_sha] | |
while commit && !master_commits.key?(commit.sha) | |
branch_commits << commit | |
commit = commits[commit.first_parent_sha] | |
end | |
branch_commits | |
end | |
def master_commits | |
@master_commits ||= begin | |
master_commits = {} | |
# Traverse the first parent of every merge commit (similar to the --first-parent switch | |
# in some git commands). This should eventually end at the root commit. | |
_, commit = commits.first | |
while commit | |
master_commits[commit.sha] = commit | |
# First parent of a merge commit is on the same branch | |
commit = commits[commit.first_parent_sha] | |
end | |
master_commits | |
end | |
end | |
def commits | |
@commits ||= begin | |
format_arg = "--format=format:#{COMMIT_DATA_SEPARATOR}%aI %H %P #{SUBJECT_SEPARATOR} %s" | |
since_arg = "--since=\"#{since}\"" | |
branch_arg = "origin/master" | |
commit_data = @git_client.call("log", "--name-status", format_arg, since_arg, branch_arg) | |
commit_data = commit_data.split(COMMIT_DATA_SEPARATOR) | |
# Keep track of files that are deleted. There's no need to keep them for consideration, so | |
# we'll drop them if we encounter them again. Also, since we're going in chronologically | |
# descending order, we should capture everything in a single pass. | |
deleted_files = Set.new | |
# First entry will always be empty, because there's a separator at the beginning of the file | |
commit_data.drop(1).each_with_object({}) do |data, commits| | |
lines = data.lines.map(&:chomp) | |
header, subject = lines.first.split(SUBJECT_SEPARATOR) | |
changed_files = Array(lines[1..-1]).map do |line| | |
next nil if line.empty? | |
status, file1, file2 = line.split | |
file = case status | |
when 'D' | |
deleted_files << file1 | |
nil | |
when /R\d+/ | |
deleted_files << file1 | |
file2 | |
else | |
file1 | |
end | |
if deleted_files.include?(file) | |
nil | |
else | |
file | |
end | |
end | |
author_date, sha, first_parent_sha, second_parent_sha = header.split | |
commits[sha] = CommitData.new( | |
Time.iso8601(author_date), | |
sha, | |
subject, | |
first_parent_sha, | |
second_parent_sha, | |
changed_files.compact | |
) | |
end | |
end | |
end | |
end | |
class RubyMethods | |
Method = Struct.new(:name, :range, keyword_init: true) | |
def initialize | |
# | |
end | |
def methods(contents) | |
method_definitions(parse(contents)) | |
end | |
private | |
def method_definitions(node, methods = []) | |
return unless node | |
return unless node.is_a?(::RubyVM::AbstractSyntaxTree::Node) | |
name = case node.type.downcase | |
# def foo; ...; end | |
when :def, :defn | |
node.children[0].to_s | |
# def foo.bar; ...; end | |
when :defs | |
node.children[1].to_s | |
else | |
nil | |
end | |
if name | |
methods << Method.new( | |
name: name, | |
range: (node.first_lineno .. node.last_lineno), | |
) | |
end | |
node.children.each do |child| | |
method_definitions(child, methods) | |
end | |
methods | |
end | |
def parse(content) | |
$stderr = StringIO.new | |
RubyVM::AbstractSyntaxTree.parse(content) | |
rescue SyntaxError => e | |
STDERR.puts("#{file_path}: could not parse (#{e})") | |
nil | |
ensure | |
$stderr = STDERR | |
end | |
end | |
class GitDiff | |
Diff = Struct.new(:name_before, :name_after, :chunks, keyword_init: true) | |
Chunk = Struct.new(:before, :after, keyword_init: true) | |
def initialize(git_client) | |
@git_client = git_client | |
end | |
def diffs_for(sha) | |
show_output = @git_client.call("show", "--format=oneline", "--unified=0", sha).lines | |
diffs(show_output).map do |lines| | |
# If file is simply deleted or renamed, ignore | |
next if lines.length < 4 | |
# We skip the first two lines, that look like this | |
# | |
# diff --git a/app/models/model.rb b/app/models/model.rb | |
# index 123456789abc..abc123def456 100644 | |
# | |
# Then get the names from the next two lines, that look like this: | |
# | |
# --- a/app/models/model.rb | |
# +++ b/app/models/model.rb | |
# | |
name_before = lines[2].split[1][2..] | |
name_after = lines[3].split[1][2..] | |
# There can be many of these under the above header | |
# | |
# @@ -2880,2 +2876 @@ functional context | |
# - some text that was here before | |
# - more previous text | |
# + some new stuff getting added | |
# | |
chunks = lines.slice_when { |_, next_line| next_line.start_with?("@@") } | |
chunks = chunks.map do |chunk_lines| | |
_, removals, additions, * = chunk_lines[0].split | |
removals = removals[1..].split(",").map(&:to_i) | |
additions = additions[1..].split(",").map(&:to_i) | |
# If no comma, implicit chunk length of 1 | |
removals << 1 if removals.length == 1 | |
additions << 1 if additions.length == 1 | |
Chunk.new( | |
before: range(*removals), | |
after: range(*additions), | |
) | |
end | |
Diff.new( | |
name_before: name_before, | |
name_after: name_after, | |
chunks: chunks, | |
) | |
end | |
end | |
private | |
def range(start, length) | |
if length == 0 | |
-5 .. -5 # arbitrary range that won't make sense in the context of file line numbers | |
else | |
start ... (start + length) | |
end | |
end | |
def diffs(output) | |
output | |
.drop(1) | |
.map(&:chomp) | |
.slice_when { |_, next_line| next_line.start_with?("diff") } | |
end | |
end | |
end | |
def ranges_overlap?(a, b) | |
a_end = a.exclude_end? ? a.end - 1 : a.end | |
b_end = b.exclude_end? ? b.end - 1 : b.end | |
b.begin <= a_end && a.begin <= b_end | |
end | |
# TODO parameterize these constants | |
INCLUDE_GLOBS = [ | |
"**/*.rb", | |
] | |
EXCLUDE_GLOBS = [ | |
"config/**/*", | |
"db/**/*", | |
"**/test/**/*", | |
"**/spec/**/*", | |
#"**/graph{ql,_api}/**/*", | |
] | |
SINCE = "2 months ago" | |
FILE_COUNT_THRESHOLD = 15 | |
file = ARGV[0] | |
git_client = GitClient.new(Dir.pwd) | |
ruby_methods = Extractors::RubyMethods.new | |
git_differ = Extractors::GitDiff.new(git_client) | |
current_methods = ruby_methods.methods(File.read(file)).map(&:name).to_set | |
pull_requests = Extractors::Git.new(git_client: git_client, since: SINCE).extract | |
pull_requests.each do |pr| | |
next if pr.files.length > FILE_COUNT_THRESHOLD | |
affected_methods = pr.commits.flat_map do |commit| | |
next [] unless commit.changed_files.include?(file) | |
diffs = git_differ.diffs_for(commit.sha) | |
file_diff = diffs.find { |diff| diff.name_after == file } | |
file_contents = git_client.call("show", "#{commit.sha}:#{file}") | |
methods = ruby_methods.methods(file_contents) | |
# So what we're going to do is go over the chunks and match the lines getting added | |
# to the detected methods. This will roughly let us know what's getting changed. | |
methods.select do |method| | |
next unless current_methods.include?(method.name) | |
file_diff.chunks.any? do |chunk| | |
ranges_overlap?(method.range, chunk.after) | |
end | |
end | |
end | |
components = pr.files.map do |file| | |
next unless file.start_with?("components") | |
file.split("/", 3)[1] | |
end | |
unless affected_methods.empty? | |
puts pr.key | |
puts " Methods:" | |
puts affected_methods.map(&:name).uniq.sort.map { |x| " -#{x}" } | |
puts "" | |
puts " Components:" | |
puts components.compact.uniq.sort.map { |x| " -#{x}" } | |
puts "" | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment