Skip to content

Instantly share code, notes, and snippets.

@thegedge
Created January 9, 2020 15:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thegedge/f92abf2ab435402e5263cc34ca8f6f6b to your computer and use it in GitHub Desktop.
Save thegedge/f92abf2ab435402e5263cc34ca8f6f6b to your computer and use it in GitHub Desktop.
Standalone script to read git history, and render a graph showing files that change together
#!/usr/bin/ruby --disable-gems
#
# # Prerequisites
#
# - ghostscript
# - graphviz
#
# # Example usage
#
# graph_mutual_changes --clustering=package --since="6 months ago" && open graph.pdf
#
require "json"
require "open3"
require "optparse"
require "pathname"
require "set"
require "time"
require "tmpdir"
module Extractors
FileGrouping = Struct.new("FileGrouping", :key, :files, :weight)
class Git
def initialize(git_dir:, since:)
@git_dir = git_dir
@since = since
end
def extract
pull_requests_files
end
private
CommitData = Struct.new(
"CommitData",
:author_date, :sha, :subject, :first_parent_sha, :second_parent_sha, :changed_files
)
private_constant :CommitData
COMMIT_DATA_SEPARATOR = "--->"
private_constant :COMMIT_DATA_SEPARATOR
SUBJECT_SEPARATOR = ">>>"
private_constant :SUBJECT_SEPARATOR
MERGE_COMMIT_PR_REGEX = /Merge pull request #(\d+) from/
private_constant :MERGE_COMMIT_PR_REGEX
def pull_requests_files
now = Time.now
oldest = now - master_commits.entries.last.last.author_date
youngest = now - master_commits.entries.first.last.author_date
range = (oldest - youngest).to_f
master_commits.map do |_sha, master_commit|
weight = 1 - ((now - master_commit.author_date) / range)
FileGrouping.new(pr_number(master_commit), branch_files(master_commit), weight)
end
end
private
attr_reader :since
def pr_number(commit)
match = commit.subject.match(MERGE_COMMIT_PR_REGEX)
return nil unless match
match[1]
end
def branch_files(merge_commit)
files = merge_commit.changed_files.dup
# Traverse backwards through the branch until we return to master
branch_commits = []
commit = commits[merge_commit.second_parent_sha]
while commit && !master_commits.key?(commit.sha)
files += commit.changed_files
commit = commits[commit.first_parent_sha]
end
files.uniq!
files
end
def master_commits
@master_commits ||= begin
master_commits = {}
# Traverse the first parent of every merge commit (similar to the --first-parent switch
# in some git commands). This should eventually end at the root commit.
_, commit = commits.first
while commit
master_commits[commit.sha] = commit
# First parent of a merge commit is on the same branch
commit = commits[commit.first_parent_sha]
end
master_commits
end
end
def commits
@commits ||= begin
format_arg = "--format=format:#{COMMIT_DATA_SEPARATOR}%aI %H %P #{SUBJECT_SEPARATOR} %s"
since_arg = "--since=\"#{since}\""
branch_arg = "origin/master"
commit_data = git("log", "--name-status", format_arg, since_arg, branch_arg).split(COMMIT_DATA_SEPARATOR)
# Keep track of files that are deleted. There's no need to keep them for consideration, so
# we'll drop them if we encounter them again. Also, since we're going in chronologically
# descending order, we should capture everything in a single pass.
deleted_files = Set.new
# First entry will always be empty, because there's a separator at the beginning of the file
commit_data.drop(1).each_with_object({}) do |data, commits|
lines = data.lines.map(&:chomp)
header, subject = lines.first.split(SUBJECT_SEPARATOR)
changed_files = Array(lines[1..-1]).map do |line|
next nil if line.empty?
status, file1, file2 = line.split
file = case status
when 'D'
deleted_files << file1
nil
when /R\d+/
deleted_files << file1
file2
else
file1
end
if deleted_files.include?(file)
nil
else
file
end
end
author_date, sha, first_parent_sha, second_parent_sha = header.split
commits[sha] = CommitData.new(
Time.iso8601(author_date),
sha,
subject,
first_parent_sha,
second_parent_sha,
changed_files.compact
)
end
end
end
def git(*args)
stdout, stderr, status = Open3.capture3("git", "-C", @git_dir, *args)
return stdout if status.success?
STDERR.puts("Failed to run git command: #{args.join(" ")}")
STDERR.puts(stderr)
exit status.to_i
end
end
end
class Graph
attr_reader :edges
def initialize(edges)
@edges = edges.freeze
end
def nodes
@nodes ||= edges.keys.flat_map(&:itself).uniq
end
end
module Transformers
class ExcludeNonExistantFiles
def filter!(data)
current_files = INCLUDE_GLOBS.flat_map { |glob| Dir[glob] }
current_files = Set.new(current_files)
data.each do |file_grouping|
filtered_files = file_grouping.files.select! { |file| current_files.include?(file) }
end
end
end
class FileGlobFilter
def initialize(include_globs: INCLUDE_GLOBS, exclude_globs: EXCLUDE_GLOBS)
@include_globs = include_globs
@exclude_globs = exclude_globs
end
def filter!(data)
opts = File::FNM_PATHNAME | File::FNM_EXTGLOB
data.each do |file_grouping|
file_grouping.files.select! do |file|
@include_globs.any? { |glob| File.fnmatch?(glob, file, opts) }
end
end
data.each do |file_grouping|
file_grouping.files.reject! do |file|
@exclude_globs.any? { |glob| File.fnmatch?(glob, file, opts) }
end
end
end
end
class FileChangesToGraph
def initialize(max_changed)
@max_changed = max_changed
end
def transform(pull_request_files)
edges = pull_request_files.each_with_object({}) do |file_grouping, edges|
next if file_grouping.files.length > @max_changed
file_grouping.files.combination(2).each do |file1, file2|
key = [file1, file2]
key = [key.min, key.max]
edges[key] ||= 0
edges[key] += file_grouping.weight
end
end
Graph.new(edges)
end
end
class FindSubgraphs
def transform(graph)
node_to_subgraph = {}
subgraphs = graph.edges.each_with_object([]) do |(edge, weight), subgraphs|
a, b = edge
a_subgraph = node_to_subgraph[a]
b_subgraph = node_to_subgraph[b]
if a_subgraph
a_subgraph[edge] = weight
node_to_subgraph[b] = a_subgraph
elsif b_subgraph
b_subgraph[edge] = weight
node_to_subgraph[a] = b_subgraph
else
subgraph = { edge => weight }
node_to_subgraph[a] = subgraph
node_to_subgraph[b] = subgraph
subgraphs << subgraph
end
end
subgraphs.sort_by!(&:length)
subgraphs.reverse!
subgraphs.map! { |edges| Graph.new(edges) }
subgraphs
end
end
class RemoveSmallSubgraphs
def initialize(threshold)
@threshold = threshold
end
def transform(graphs)
graphs.select do |graph|
graph.nodes.length >= @threshold
end
end
end
class NormalizeGraphWeights
def transform(graphs)
graphs.map do |graph|
min_edge_weight = graph.edges.values.min
max_edge_weight = graph.edges.values.max
edges = graph.edges.map do |edge, weight|
weight = if min_edge_weight == max_edge_weight
0.5
else
normalizing_factor = 1 / (max_edge_weight - min_edge_weight)
normalizing_factor * (weight - min_edge_weight)
end
[edge, weight]
end
Graph.new(edges.to_h)
end
end
end
end
module Clustering
class None
def cluster(graph)
{ nil => graph.nodes }
end
end
class ByDirectory
def initialize(max_depth)
@max_depth = max_depth
end
def cluster(graph)
graph.nodes.group_by do |node|
File.dirname(node).split("/", @max_depth + 1).take(@max_depth).join("/")
end
end
end
class ByPackage
UNKNOWN_PACKAGE_NAME = "other"
def initialize(exclude_unknown: false)
@package_paths = Dir["**/package.yml"]
.map { |path| File.dirname(path) }
.sort_by(&:length)
@exclude_unknown = exclude_unknown
end
def cluster(graph)
graph.nodes.group_by { |node| cluster_for(node) }.tap do |clusters|
clusters.delete(UNKNOWN_PACKAGE_NAME) if @exclude_unknown
end
end
private
def cluster_for(node)
package = @package_paths.find { |path| node.start_with?(path) }
package || UNKNOWN_PACKAGE_NAME
end
end
end
module Loaders
class Graphviz
def initialize(graph, clustering: Clustering::None.new)
@graph = graph
@node_aliases = build_node_aliases(graph)
@clustering = clustering
# HSV pastelle color palette
hsvs = (1..19).map do |x|
[
(x*11 % 19) / 19.0, # make consecutive hues be reasonably distant from each other
0.05 + (rand * 0.1),
0.80 + (rand * 0.15),
]
end
@cluster_color_list = hsvs
.map { |h, s, v| hsv2rgb(h, s ,v) }
.cycle
end
def render_to(io = STDOUT)
io.puts("graph {")
io.puts(" overlap = false;")
io.puts(" outputorder = edgesfirst;")
io.puts(" model = mds;")
io.puts(" node [shape=rectangle style=\"filled, rounded\"];")
io.puts("")
edges = graph.edges.dup
cluster_to_color = {}
node_to_cluster = {}
clusters = clustering.cluster(graph)
clusters.each do |cluster_name, nodes|
cluster_color = @cluster_color_list.next
cluster_to_color[cluster_name] = cluster_color
# TODO hack to not draw subgraph if just a single cluster
if clusters.length > 1
io.puts(" subgraph cluster_#{cluster_name.gsub(/[^A-Za-z0-9_]/, '_')} {")
io.puts(" margin = 50;")
io.puts(" style = invis;")
# These currently aren't rendered because style = invis above.
# TODO perhaps make the subgraph style an option?
io.puts(" color = \"#{cluster_color}55\";")
io.puts(" pencolor = \"#00000055\";")
io.puts(" penwidth = 5;")
io.puts(" label = \"#{cluster_name}\";")
io.puts(" labeljust = l;")
io.puts(" fontsize = 64;")
io.puts(" fontname = \"times bold\";")
io.puts("")
end
io.puts(" node [fillcolor=\"#{cluster_color}\"];")
nodes.each do |node|
node_to_cluster[node] = cluster_name
node_alias = node_aliases[node]
io.puts(" #{node_alias} [#{node_attributes(node)}];")
end
io.puts("")
edges = edges.delete_if do |(a, b), weight|
next false if !nodes.include?(a) || !nodes.include?(b)
a_name = node_aliases[a]
b_name = node_aliases[b]
io.puts(" #{a_name} -- #{b_name} [#{edge_attributes(a, b, weight, cluster_color)}];")
true
end
io.puts(" }") if clusters.length > 1
io.puts("")
end
edges = edges.each do |(a, b), weight|
a_cluster = node_to_cluster[a]
b_cluster = node_to_cluster[b]
next unless a_cluster && b_cluster
a_name = node_aliases[a]
b_name = node_aliases[b]
a_color = cluster_to_color[a_cluster]
b_color = cluster_to_color[b_cluster]
color = "#{b_color};0.5:#{a_color}"
io.puts(" #{a_name} -- #{b_name} [#{edge_attributes(a, b, weight, color)}];")
end
io.puts("}")
end
private
attr_reader :graph, :node_aliases, :clustering
def build_node_aliases(graph)
node_name = "A"
graph.nodes.each_with_object({}) do |node, map|
map[node] = node_name
node_name = node_name.next
end
end
def node_attributes(node)
%(label="#{node}")
end
def edge_attributes(a, b, weight, color)
length = Math.exp(1 - weight) - 0.9
weight = Math.exp(3*weight) - 1
%(weight=#{weight.round(2)} len=#{length.round(2)} color="#{color}")
end
def hsv2rgb(h, s, v)
h = (h * 360) % 360
c = v*s
x = c*(1 - ((h / 60) % 2 - 1).abs)
m = v - c
rgb = case h
when 0...60
[c, x, 0]
when 60...120
[x, c, 0]
when 120...180
[0, c, x]
when 180...240
[0, x, c]
when 240...300
[x, 0, c]
when 300...360
[c, 0, x]
end
r, g, b = rgb.map { |v| to_hex(255 * (v + m)) }
"##{r}#{g}#{b}"
end
def to_hex(value)
value.to_i.to_s(16).rjust(2, '0')
end
end
end
# TODO parameterize these constants
INCLUDE_GLOBS = [
"**/*.rb",
]
EXCLUDE_GLOBS = [
"config/**/*",
"db/**/*",
"**/test/**/*",
"**/spec/**/*",
#"**/graph{ql,_api}/**/*",
]
options = {
history_since: "3 months ago",
max_changed: 25,
minimum_graph_size: 10,
max_graphs: 25,
exclude_unknown: false,
clustering: "package",
output_graphs: true,
output_directory: Pathname.pwd,
output_format: "pdf",
graphviz_command: "neato",
}
option_parser = OptionParser.new do |opts|
opts.banner = "Usage: #{$PROGRAM_NAME} [options]"
opts.on("--clustering=CLUSTERING", "Cluster by package, directory, or none (default: package)") do |v|
options[:clustering] = v
end
opts.on(
"--max-changed=N",
Integer,
"Do not include PRs that have more than this many files (default: 50)"
) do |v|
options[:max_changed] = v
end
opts.on( "--since=SINCE", "Only grab file data this far back in git history (default: 3 months ago)") do |v|
options[:history_since] = v
end
opts.on( "--min-graph-size=N", Integer, "Filter out graphs with fewer than this many nodes (default: 10)") do |v|
options[:history_since] = v
end
opts.on("--max-graphs=N", Integer, "Only output this many subgraphs (default: 25)") do |v|
options[:max_graphs] = v
end
opts.on("--output-directory=PATH", "Directory to emit files (default: tmp/packages)") do |v|
options[:output_directory] = v
end
opts.on("--output-format=FORMAT", "File type to emit (default: pdf)") do |v|
options[:output_format] = v
end
opts.on("--no-graph-output", "If specified, don't render any graphs ") do |v|
options[:output_graphs] = false
end
opts.on("--graphviz-command=CMD", "Graphviz command to use for emitting results (default: fdp)") do |v|
options[:graphviz_command] = v
end
opts.on("--exclude-unknown", "Exclude anything not in a package, if clustering by package") do |v|
options[:exclude_unknown] = v
end
end
args = ARGV.dup
option_parser.parse!(args)
files_to_show_prs_for = args
clusterer = case options[:clustering]
when "none"
Clustering::None.new
when "directory"
Clustering::ByDirectory.new(5)
when "package"
Clustering::ByPackage.new(exclude_unknown: options[:exclude_unknown])
else
abort("Unknown clustering option: #{options[:clustering]}")
end
#--------------------------------------------------------------------------------
# ETL pipeline:
#
# -- Extract --
#
# Get data that shows files that change together.
#
# Output :: Array[FileGrouping]
#
#
# -- Transform --
#
# Filtering followed by transformations. End result should be a list of graphs.
#
# Output :: Array[Graph]
#
#
# -- Loaders --
#
# Can output any graph format. Currently just graphviz.
#--------------------------------------------------------------------------------
file_groupings = Extractors::Git.new(git_dir: Dir.pwd, since: options.fetch(:history_since)).extract
file_groupings = Transformers::FileGlobFilter.new.filter!(file_groupings)
file_groupings = Transformers::ExcludeNonExistantFiles.new.filter!(file_groupings)
graph = Transformers::FileChangesToGraph.new(options.fetch(:max_changed)).transform(file_groupings)
subgraphs = Transformers::FindSubgraphs.new.transform(graph)
subgraphs = Transformers::RemoveSmallSubgraphs.new(options.fetch(:minimum_graph_size)).transform(subgraphs)
subgraphs = Transformers::NormalizeGraphWeights.new.transform(subgraphs)
subgraphs = subgraphs.take(options.fetch(:max_graphs))
abort "No data to render!" if subgraphs.empty?
if options.fetch(:output_graphs)
Dir.mktmpdir do |dir|
output_format = options.fetch(:output_format)
subgraphs.each_with_index do |graph, index|
output_file_path = File.join(dir, "graph_#{index.to_s.rjust(3, '0')}.#{output_format}")
args = [options.fetch(:graphviz_command), "-T", output_format, "-o", output_file_path]
Open3.popen3(*args) do |stdin, stdout, stderr, waiter_thread|
Loaders::Graphviz.new(graph, clustering: clusterer).render_to(stdin)
stdin.close
IO.copy_stream(stdout, STDOUT)
IO.copy_stream(stderr, STDERR)
abort unless waiter_thread.value.success?
end
end
output_directory = Pathname.new(options.fetch(:output_directory))
output_directory.mkpath
output_file = output_directory.join("graph.pdf")
abort unless system("gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile=\"#{output_file}\" #{dir}/*.pdf")
end
end
unless files_to_show_prs_for.empty?
file_groupings.each do |file_grouping|
files = file_grouping.files
if files.any? { |f| files_to_show_prs_for.include?(f) }
# TODO perhaps parse origin remote to construct a URL to PR
puts(file_grouping.key)
files.each do |file|
if files_to_show_prs_for.include?(file)
# output green for a file that was chosen
puts "\t -\e[32m#{file}\e[0m"
else
puts("\t- #{file}")
end
end
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment