@rinov
Created August 22, 2023 07:59
GitHub activity metrics for productivity
import os
import requests
import yaml
import json
import csv
import japanize_matplotlib  # For rendering Japanese plot labels
import functools
import seaborn as sns
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil.parser import parse as date_parse

"""
Github Acitivity Analyzer
- PRの最初のコミットからレビューリクエストまでの時間 (first commit to review request)
- レビューリクエストから最初のレビューがつくまでの時間 (review request to first review)
- レビューリクエストからマージまでの時間 (review request to merge)
- レビューリクエスト後の修正回数 (number of fix in review)
- PRの変更行数 (pr size)
- PRのコメント総数 (total comments)
- PRがマージされてからリリースされるまでにかかった時間 (deploy time)
"""
class GitHubPRAnalyzer:
    # Read the GitHub personal access token set in the environment
    GITHUB_TOKEN = os.getenv("GITHUB_PERSONAL_ACCESS_TOKENS")
    HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}
    BASE_URL = "https://api.github.com/repos"

    @staticmethod
    @functools.lru_cache
    def get_json(url):
        print("request:", url)
        response = requests.get(url, headers=GitHubPRAnalyzer.HEADERS)
        if response.status_code != 200:
            return None
        return response.json()

    @staticmethod
    def get_timestamp_from_date(date_string):
        return date_parse(date_string).timestamp() if date_string else None

    @staticmethod
    def get_url(owner, repo, endpoint, pr_number=None):
        return (
            f"{GitHubPRAnalyzer.BASE_URL}/{owner}/{repo}/{endpoint}/{pr_number}"
            if pr_number
            else f"{GitHubPRAnalyzer.BASE_URL}/{owner}/{repo}/{endpoint}"
        )

    @staticmethod
    @functools.lru_cache
    def get_tag_date(owner, repo, tag_name):
        # The git refs API response carries no date field, so resolve the tag
        # through the commits API, which accepts a tag name as the ref
        tag_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{tag_name}"
        print("request:", tag_url)
        tag_response = requests.get(tag_url, headers=GitHubPRAnalyzer.HEADERS)
        tag_date = datetime.strptime(
            tag_response.json()["commit"]["author"]["date"], "%Y-%m-%dT%H:%M:%SZ"
        )
        return tag_date

    @staticmethod
    def get_pr_metrics(pr_json, commits_json):
        review_comments = pr_json["review_comments"]
        pr_size = pr_json["additions"] + pr_json["deletions"]
        file_changes = pr_json["changed_files"]
        # Count commits made after the PR was opened (fixes during review)
        number_of_fix_in_review = sum(
            1
            for commit in commits_json
            if (
                GitHubPRAnalyzer.get_timestamp_from_date(
                    commit["commit"]["author"]["date"]
                )
                - GitHubPRAnalyzer.get_timestamp_from_date(pr_json["created_at"])
            )
            > 0
        )
        return review_comments, pr_size, file_changes, number_of_fix_in_review

    @staticmethod
    @functools.lru_cache
    def get_timeline_events(owner, repo, pr_number):
        url = f"https://api.github.com/repos/{owner}/{repo}/issues/{pr_number}/timeline"
        headers = {"Accept": "application/vnd.github+json"}
        headers.update(GitHubPRAnalyzer.HEADERS)
        print("request:", url)
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def get_pr_times(pr_json, commits_json, reviews_json, timeline_events):
        merged_at = GitHubPRAnalyzer.get_timestamp_from_date(pr_json["merged_at"])
        first_commit_at = GitHubPRAnalyzer.get_timestamp_from_date(
            commits_json[0]["commit"]["author"]["date"]
        )
        if timeline_events is not None:
            review_request_events = [
                event
                for event in timeline_events
                if event["event"] == "review_requested"
            ]
            first_review_request_at = (
                GitHubPRAnalyzer.get_timestamp_from_date(
                    review_request_events[0]["created_at"]
                )
                if review_request_events
                else None
            )
        else:
            first_review_request_at = None
        first_review_at = None
        author = pr_json["user"]["login"]
        reviews_json = sorted(reviews_json, key=lambda x: x["submitted_at"])
        # Ignore users who are not reviewers
        ignore_users = ["renovate"]
        for review in reviews_json:
            if (
                review["user"]["login"] != author
                and review["user"]["login"] not in ignore_users
                and review["state"] in ("COMMENTED", "APPROVED", "CHANGES_REQUESTED")
            ):
                first_review_at = GitHubPRAnalyzer.get_timestamp_from_date(
                    review["submitted_at"]
                )
                break
        review_request_to_merge = (
            merged_at - first_review_request_at
            if merged_at and first_review_request_at
            else None
        )
        first_commit_to_review_request = (
            first_review_request_at - first_commit_at
            if first_review_request_at and first_commit_at
            else None
        )
        review_request_to_first_review = (
            abs(first_review_at - first_review_request_at)
            if first_review_at and first_review_request_at
            else None
        )
        if first_review_request_at:
            time_differences = [
                GitHubPRAnalyzer.get_timestamp_from_date(
                    commit["commit"]["author"]["date"]
                )
                - first_review_request_at
                for commit in commits_json
                if GitHubPRAnalyzer.get_timestamp_from_date(
                    commit["commit"]["author"]["date"]
                )
                > first_review_request_at
                and commit["author"] is not None
                and commit["author"]["login"] == author
            ]
            # Average time spent on changes and fixes after the review request
            time_of_fix_to_change = (
                sum(time_differences) / len(time_differences)
                if time_differences
                else 0
            )
        else:
            time_of_fix_to_change = 0
        return (
            first_commit_to_review_request,
            review_request_to_first_review,
            review_request_to_merge,
            time_of_fix_to_change,
            first_review_request_at,
        )

    @staticmethod
    def get_pr_info(owner, repo, pr_number, tags):
        pr_url = GitHubPRAnalyzer.get_url(owner, repo, "pulls", pr_number)
        pr_json = GitHubPRAnalyzer.get_json(pr_url)
        if pr_json is None:
            return None
        if not pr_json["merged"]:
            return None
        author = pr_json["user"]["login"]
        commits_json = GitHubPRAnalyzer.get_json(pr_json["commits_url"])
        reviews_url = (
            GitHubPRAnalyzer.get_url(owner, repo, "pulls", pr_number) + "/reviews"
        )
        reviews_json = GitHubPRAnalyzer.get_json(reviews_url)
        first_commit_at = datetime.strptime(
            commits_json[0]["commit"]["author"]["date"], "%Y-%m-%dT%H:%M:%SZ"
        )
        merged_at = datetime.strptime(pr_json["merged_at"], "%Y-%m-%dT%H:%M:%SZ")
        deploy_time = 0
        # Note: measuring deploy_time means walking back from the most recent
        # tag's commit to merged_at, which takes a large number of requests.
        # Without a binary search over tag timestamps plus memoization the wait
        # becomes far too long; see the sketch after this method.
        for tag in reversed(tags):
            tag_commit_hash = tag["commit"]
            tag_url = "https://api.github.com/repos/{}/{}/commits/{}".format(
                owner, repo, tag_commit_hash
            )
            tag_date = GitHubPRAnalyzer.get_json(tag_url)["commit"]["author"]["date"]
            # deploy_time is the time from the first commit until the first tag
            # created at or after merged_at
            tag_date = datetime.strptime(tag_date, "%Y-%m-%dT%H:%M:%SZ")
            if tag_date >= merged_at:
                deploy_time = (tag_date - first_commit_at).total_seconds()
                break
        timeline_events = GitHubPRAnalyzer.get_timeline_events(owner, repo, pr_number)
        review_comments = pr_json["review_comments"]
        pr_size = pr_json["additions"] + pr_json["deletions"]
        file_changes = pr_json["changed_files"]
        (
            first_commit_to_review_request,
            review_request_to_first_review,
            review_request_to_merge,
            time_of_fix_to_change,
            first_review_request_at,
        ) = GitHubPRAnalyzer.get_pr_times(
            pr_json, commits_json, reviews_json, timeline_events
        )
        # Number of fix commits pushed after the review request
        if first_review_request_at:
            number_of_fix_in_review = sum(
                1
                for commit in commits_json
                if GitHubPRAnalyzer.get_timestamp_from_date(
                    commit["commit"]["author"]["date"]
                )
                > first_review_request_at
            )
        else:
            number_of_fix_in_review = 0
        return {
            "author": author,
            "url": pr_url,
            "first_commit_to_review_request": first_commit_to_review_request,
            "review_request_to_first_review": review_request_to_first_review,
            "review_request_to_merge": review_request_to_merge,
            "number_of_fix_in_review": number_of_fix_in_review,
            "time_of_fix_to_change": time_of_fix_to_change,
            "pr_size": pr_size,
            "changed_files": file_changes,
            "review_comments": review_comments,
            "deploy_time": deploy_time,
        }

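    # A minimal sketch of the binary search suggested in the note above
    # (hypothetical helper, not wired into get_pr_info). It assumes the caller
    # passes tags ordered newest-first with tag["commit"] holding a commit hash,
    # as the reversed() loop above does; get_json's lru_cache memoizes the
    # per-tag date lookups, so this needs O(log n) requests instead of O(n).
    @staticmethod
    def find_release_tag(owner, repo, tags, merged_at):
        def tag_date(tag):
            url = f"{GitHubPRAnalyzer.BASE_URL}/{owner}/{repo}/commits/{tag['commit']}"
            date_string = GitHubPRAnalyzer.get_json(url)["commit"]["author"]["date"]
            return datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")

        # Newest-first means dates descend with the index, so tags dated at or
        # after merged_at form a prefix; find the last (oldest) tag in it,
        # i.e. the first tag cut after the merge.
        low, high, candidate = 0, len(tags) - 1, None
        while low <= high:
            mid = (low + high) // 2
            if tag_date(tags[mid]) >= merged_at:
                candidate = tags[mid]
                low = mid + 1  # older tags lie to the right
            else:
                high = mid - 1
        return candidate
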
    @staticmethod
    def search_pull_requests(owner, repo, start_date, end_date, keywords, authors):
        author_query = " ".join([f"author:{author}" for author in authors])
        query = f"repo:{owner}/{repo} type:pr is:closed {author_query} " + " ".join(keywords)
        query += f" created:{start_date.strftime('%Y-%m-%d')}..{end_date.strftime('%Y-%m-%d')}"
        search_url = f"https://api.github.com/search/issues?q={query}"
        pr_numbers = []
        # Follow the Search API's Link-header pagination until the last page
        while search_url:
            response = requests.get(search_url, headers=GitHubPRAnalyzer.HEADERS)
            response.raise_for_status()
            pr_json = response.json()
            pr_numbers.extend(pr["number"] for pr in pr_json["items"])
            search_url = response.links.get("next", {}).get("url")
        return pr_numbers

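    # As a worked example of the query construction above: with
    # authors=["rinov"], keywords=["feature"], and the default dates in main(),
    # the query sent to the Search API would be
    #   repo:rinov/metrics type:pr is:closed author:rinov feature created:2023-07-01..2023-08-01
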
    @staticmethod
    def save_to_yaml(owner, repo, pr_numbers, file_path):
        data = {"owner": owner, "repo": repo, "pull_requests": pr_numbers}
        with open(file_path, "w") as file:
            yaml.safe_dump(data, file)

    @staticmethod
    def plot_statistics(file_path, columns_to_convert, exclude_columns):
        # Axis units are Japanese, rendered via japanize_matplotlib:
        # 分 = minutes, 行 = lines, ファイル = files, 件 = count
        column_units = {
            "first_commit_to_review_request": "分",
            "review_request_to_first_review": "分",
            "review_request_to_merge": "分",
            "time_of_fix_to_change": "分",
            "pr_size": "行",
            "changed_files": "ファイル",
            "review_comments": "件",
            "deploy_time": "分",
        }
        data = pd.read_csv(file_path)
        # deploy_time is always 0 while main() passes tags=[], so drop it
        del data["deploy_time"]
        # Convert the duration metrics from seconds to minutes
        for column in columns_to_convert:
            data[column] = data[column] / 60
        other_columns = [
            col
            for col in data.columns
            if col not in columns_to_convert + exclude_columns
        ]
        all_columns = columns_to_convert + other_columns
        fig, axes = plt.subplots(2, 4, figsize=(16, 5))
        # Figure title: "Visualization of productivity metrics"
        fig.suptitle("生産性メトリクスの可視化", fontsize=16)
        sns.set_palette("pastel")
        for i, column in enumerate(all_columns[:8]):
            ax = axes[i // 4, i % 4]
            sns.histplot(data[column], bins=30, ax=ax)
            ax.set_title(column)
            ax.set_ylabel("件数")
            mean_value = data[column].mean()
            median_value = data[column].median()
            unit = column_units.get(column, "")
            # Annotate each histogram with its mean (平均) and median (中央値)
            ax.text(
                0.5,
                0.85,
                f"平均: {int(mean_value)} {unit}",
                transform=ax.transAxes,
            )
            ax.text(
                0.5,
                0.7,
                f"中央値: {int(median_value)} {unit}",
                transform=ax.transAxes,
            )
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()


def main():
    # GitHub organization (owner) name
    owner = "rinov"
    # Target repository name
    repo = "metrics"
    # Analysis period
    start_date = dt.datetime(2023, 7, 1)
    end_date = dt.datetime(2023, 8, 1)
    analyzer = GitHubPRAnalyzer()
    # Set authors to limit the analysis to specific PR authors
    authors = []
    # Set keywords to limit the analysis to matching PRs
    keywords = []
    pr_numbers = analyzer.search_pull_requests(
        owner,
        repo,
        start_date,
        end_date,
        keywords=keywords,
        authors=authors,
    )
    print(f"Found {len(pr_numbers)} PRs.")
    if not pr_numbers:
        return
    analyzer.save_to_yaml(owner, repo, pr_numbers, f"{repo}_pull_requests.yml")
    print("Fetching PR details")
    # tags is left empty here, so deploy_time stays 0; pass the repository's
    # tag list to measure it
    pr_infos = [
        analyzer.get_pr_info(owner, repo, pr_number, tags=[])
        for pr_number in pr_numbers
    ]
    json_results = json.dumps(pr_infos, indent=4, default=str)
    print(json_results)
    with open(f"{repo}_pr_infos.json", "w") as file:
        file.write(json_results)
    csv_headers = [
        "author",
        "url",
        "first_commit_to_review_request",
        "review_request_to_first_review",
        "review_request_to_merge",
        "number_of_fix_in_review",
        "time_of_fix_to_change",
        "pr_size",
        "changed_files",
        "review_comments",
        "deploy_time",
    ]
    # Skip PRs that could not be fetched or were closed without merging
    pr_infos = [pr_info for pr_info in pr_infos if pr_info is not None]
    with open(f"{repo}_pr_infos.csv", "w") as file:
        writer = csv.DictWriter(file, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(pr_infos)
    analyzer.plot_statistics(
        file_path=f"./{repo}_pr_infos.csv",
        columns_to_convert=[
            "first_commit_to_review_request",
            "review_request_to_first_review",
            "review_request_to_merge",
            "time_of_fix_to_change",
        ],
        exclude_columns=["author", "url", "deploy_time"],
    )

if __name__ == "__main__":
    main()
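
# Example setup (a sketch: the env-var and package names match the imports and
# os.getenv call above, while the script's file name is hypothetical):
#   export GITHUB_PERSONAL_ACCESS_TOKENS=<your token>
#   pip install requests pyyaml python-dateutil pandas seaborn matplotlib japanize-matplotlib
#   python github_pr_analyzer.py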