まず適当なディレクトリを作り、対象のリポジトリをすべてその中に clone しておく(<owner> には対象の owner または org を指定する)。
gh search repos --owner=<owner> --archived=false --updated='>=2023-01-01' --json=url -L 500 | jq -r '.[].url' | xargs -P10 -I% git clone %
その後以下の Ruby スクリプトで、言語を推定・集計する。
require 'bundler/inline'
gemfile do
source 'https://rubygems.org'
gem 'github-linguist', require: 'linguist'
gem 'rugged'
gem 'ruby-progressbar'
gem 'parallel'
end
require 'pathname'
require 'date'
# Finds the newest commit on the remote default branch (origin/master or
# origin/main) that was committed before the given date.
#
# git is invoked with an argument vector instead of a shell string, so a
# repository path containing spaces or shell metacharacters is passed to git
# verbatim rather than being interpreted by the shell.
#
# @param [Rugged::Repository] repo repository whose #path is handed to `git -C`
# @param [Date] date upper bound passed to `git rev-list --before`
# @return [String] commit hash, or an empty string when neither branch exists
#   or no commit precedes the date
def find_latest_commit_at(repo, date)
  git = ['git', '-C', repo.path.to_s]
  listing = IO.popen([*git, 'branch', '--remote', '-l', 'origin/master', 'origin/main'], &:read)
  main_branch = listing.strip.split.last
  # Without a branch name rev-list has nothing to walk; report "no commit".
  return '' if main_branch.nil?

  IO.popen([*git, 'rev-list', '-n', '1', "--before=#{date}", main_branch], &:read).strip
end
# Looks the commit up in the repository and reports its timestamp.
#
# @param [Rugged::Repository] repo
# @param [String] commit commit hash
# @return [Time] commit date
def find_commit_date(repo, commit)
  commit_object = repo.lookup(commit)
  commit_object.time
end
# Asks Linguist for the language breakdown of the repository at the given
# commit.
#
# @param [Rugged::Repository] repo
# @param [String] commit commit hash to analyse
# @return [Hash{String => Integer}] language name => size reported by Linguist
#   (NOTE(review): Linguist appears to report byte sizes rather than line
#   counts -- confirm against its documentation)
def total_languages(repo, commit)
  breakdown = Linguist::Repository.new(repo, commit)
  breakdown.languages
end
# Languages that get their own column in the report.
LANG_ALLOWLIST = %w[Ruby Go Python].freeze

# Tells whether a language is outside the allowlist.
#
# @param [String] language language name
# @return [Boolean] true when the name is not listed in LANG_ALLOWLIST
def detect_as_others?(language)
  LANG_ALLOWLIST.none? { |allowed| allowed == language }
end
# Keep only the clones whose latest commit (as found by find_latest_commit_at)
# was made within the last 365 days.
repos = Pathname.glob('*/.git').select do |path|
  repo = Rugged::Repository.new(path.parent)
  commit = find_latest_commit_at(repo, Date.today)
  # An empty hash means no commit was found (e.g. no origin/master or
  # origin/main); skip the clone instead of passing '' to the commit lookup.
  next false if commit.empty?

  find_commit_date(repo, commit) > (Date.today - 365).to_time
end
# One sample date per year: every January 1st that falls within the last
# 10 * 365 days, up to and including today.
dates = (Date.today - 10 * 365).upto(Date.today).select { |day| day.yday == 1 }
# Language totals for every (repository, date) pair that has a commit:
# [[repo, date, languages], [repo, date, languages], ...]
all_total = Parallel.map(repos.product(dates), progress: '') do |git_dir, date|
  repository = Rugged::Repository.new(git_dir.parent)
  sha = find_latest_commit_at(repository, date)
  # Pairs without a commit yield nil and are removed by the trailing compact.
  next if sha.empty?

  sizes = total_languages(repository, sha)
  # Workaround: the parallel gem does not support a Hash whose default object
  # is a Proc.
  sizes.default_proc = nil
  [git_dir.parent.to_s, date, sizes]
end.compact
# Sum the per-repository figures by date.
# Hash{Date => Hash{String => Integer}}: { date => { language => lines } }
data = {}
all_total.each do |_repo, date, languages|
  languages.each do |language, lines|
    totals = (data[date] ||= {})
    totals[language] = totals.fetch(language, 0) + lines
  end
end
# Column order of the report: every allowlisted language seen on any date,
# sorted by name.
seen_languages = data.each_value.flat_map(&:keys).uniq
languages = seen_languages.reject { |name| detect_as_others?(name) }.sort
# Render the totals as a TSV table and write it to data.tsv:
# |      | language | language | ...
# | date | lines    | lines    | ...
# | date | lines    | lines    | ...
# | ...
header = ([''] + languages).join("\t")
# `data` is keyed in first-seen order, which is not chronological when an
# earlier repository has no commit for the oldest dates; sort rows by date.
rows = data.sort_by { |date, _line_maps| date }.map do |date, line_maps|
  counts = languages.map { |language| line_maps[language] || 0 }
  [date.strftime('%Y-%m-%d'), *counts].join("\t")
end
tsv = [header, *rows].join("\n")
File.write('data.tsv', tsv)