# Last active March 19, 2021 17:03
#!/usr/bin/env python3
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from urllib.request import urlopen, Request
import json
import enum
import os
class IssueState(enum.Enum):
    """State filter for GitHub issue/milestone queries.

    Members are passed in this file as the value of the ``state`` query
    parameter, so each member's value is the literal string to send.
    """

    OPEN = "open"
    CLOSED = "closed"
    ALL = "all"

    def __str__(self) -> str:
        # Render as the bare value ("open", not "IssueState.OPEN") so a member
        # can be interpolated directly into a query string.
        return self.value
class GitCommit:
    """Plain record describing one git commit.

    ``commit_date`` is optional: it is left as ``None`` when the caller only
    knows the author date.
    """

    commit_hash: str
    title: str
    body: str
    author: str
    author_date: datetime
    commit_date: Optional[datetime]

    def __init__(self,
                 commit_hash: str,
                 author: str,
                 author_date: datetime,
                 title: str,
                 body: str,
                 commit_date: Optional[datetime] = None) -> None:
        # Each constructor argument is stored on the attribute of the same
        # name; hash and author must be kept as separate assignments.
        self.commit_hash = commit_hash
        self.author = author
        self.author_date = author_date
        self.commit_date = commit_date
        self.title = title
        self.body = body

    def __contains__(self, item: Any) -> bool:
        """Return True when *item* occurs in the commit body or title."""
        return item in self.body or item in self.title
def get_revert_revision(commit: GitCommit) -> Optional[str]:
    """Return the reverted revision id if the commit title marks a revert.

    A revert is recognised by a title that starts with ``Revert D<digits>:``.
    Returns the ``D<digits>`` token, or ``None`` for any other title.
    """
    import re
    # re.match anchors at the start of the title on purpose: a "Revert Dnnn:"
    # mentioned later in the title does not count.
    rc = re.match(r"Revert (D\d+):", commit.title)
    if rc is None:
        return None
    return rc.group(1)
def get_diff_revision(commit: GitCommit) -> Optional[str]:
    """Return the ``D<digits>`` id from a ``Differential Revision:`` line.

    The commit body is searched for ``Differential Revision: D<digits>``.
    Returns the id of the first occurrence, or ``None`` when there is none.
    """
    import re
    # The trailer is not at the start of the body, so this has to be a search
    # rather than a match.
    rc = re.search(r"\s*Differential Revision: (D\d+)", commit.body)
    if rc is None:
        return None
    return rc.group(1)
def is_revert(commit: GitCommit) -> bool:
    """Return True when ``get_revert_revision`` finds a reverted revision id for *commit*."""
    return get_revert_revision(commit) is not None
def parse_medium_format(lines: Union[str, List[str]]) -> GitCommit:
    """
    Expect commit message generated using `--format=medium --date=unix` format, i.e.:
        commit <sha1>
        Author: <author>
        Date: <author date>

        <title line>

        <full commit message>

    Accepts either the whole text or a list of its lines. A ``Merge:`` line
    directly after the ``commit`` line is dropped before parsing.

    Raises AssertionError when the header does not have the expected layout.
    """
    if isinstance(lines, str):
        lines = lines.split("\n")
    else:
        # Work on a copy so dropping the "Merge:" line below does not modify
        # the caller's list.
        lines = list(lines)
    # TODO: Handle merge commits correctly
    if len(lines) > 1 and lines[1].startswith("Merge:"):
        del lines[1]
    assert len(lines) > 5
    assert lines[0].startswith("commit")
    assert lines[1].startswith("Author: ")
    assert lines[2].startswith("Date: ")
    assert len(lines[3]) == 0
    return GitCommit(
        # "commit <sha1>" -> second whitespace-separated token
        commit_hash=lines[0].split()[1].strip(),
        author=lines[1].split(":", 1)[1].strip(),
        # --date=unix prints the date as integer seconds since the epoch
        author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())),
        # line 3 is the blank separator, line 4 the title, the rest is the message
        title=lines[4].strip(),
        body="\n".join(lines[5:]),
    )
def parse_fuller_format(lines: Union[str, List[str]]) -> GitCommit:
    """
    Expect commit message generated using `--format=fuller --date=unix` format, i.e.:
        commit <sha1>
        Author: <author>
        AuthorDate: <author date>
        Commit: <committer>
        CommitDate: <committer date>

        <title line>

        <full commit message>

    Accepts either the whole text or a list of its lines. A ``Merge:`` line
    directly after the ``commit`` line is dropped before parsing.

    Raises AssertionError when the header does not have the expected layout.
    """
    if isinstance(lines, str):
        lines = lines.split("\n")
    else:
        # Work on a copy so dropping the "Merge:" line below does not modify
        # the caller's list.
        lines = list(lines)
    # TODO: Handle merge commits correctly
    if len(lines) > 1 and lines[1].startswith("Merge:"):
        del lines[1]
    assert len(lines) > 7
    assert lines[0].startswith("commit")
    assert lines[1].startswith("Author: ")
    assert lines[2].startswith("AuthorDate: ")
    assert lines[3].startswith("Commit: ")
    assert lines[4].startswith("CommitDate: ")
    assert len(lines[5]) == 0
    return GitCommit(
        # "commit <sha1>" -> second whitespace-separated token
        commit_hash=lines[0].split()[1].strip(),
        author=lines[1].split(":", 1)[1].strip(),
        # --date=unix prints both dates as integer seconds since the epoch
        author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())),
        commit_date=datetime.fromtimestamp(int(lines[4].split(":", 1)[1].strip())),
        # line 5 is the blank separator, line 6 the title, the rest is the message
        title=lines[6].strip(),
        body="\n".join(lines[7:]),
    )
class GitRepo:
    """Access to the history of a local git checkout via ``git log``."""

    def __init__(self, path, remote='upstream'):
        """Remember the checkout directory and the remote name used to qualify refs."""
        self.repo_dir = path
        self.remote = remote

    def _run_git_log(self, revision_range) -> List[GitCommit]:
        """Run ``git log --format=fuller --date=unix`` and parse every commit.

        Returns the commits in the order git printed them. An empty log
        (nothing in *revision_range*) yields an empty list.
        """
        from subprocess import check_output
        output = check_output(["git", "-C", self.repo_dir, "log", '--format=fuller', '--date=unix', revision_range]).decode("utf-8")
        if not output.strip():
            # Nothing to parse; feeding an empty record to the parser would
            # trip its header assertions.
            return []
        rc = []
        cur_msg = []
        for line in output.split("\n"):
            # A line starting with "commit" opens the next record, so flush
            # whatever was collected for the previous one.
            if line.startswith("commit"):
                if len(cur_msg) > 0:
                    rc.append(parse_fuller_format(cur_msg))
                    cur_msg = []
            cur_msg.append(line)
        # Flush the final record, which has no following "commit" line.
        if len(cur_msg) > 0:
            rc.append(parse_fuller_format(cur_msg))
        return rc

    def get_commit_list(self, from_ref, to_ref) -> List[GitCommit]:
        """Return commits in ``<remote>/<from_ref>..<remote>/<to_ref>``."""
        return self._run_git_log(f"{self.remote}/{from_ref}..{self.remote}/{to_ref}")
def build_commit_dict(commits: List[GitCommit]) -> Dict[str, GitCommit]:
    """Index *commits* by their ``commit_hash``, preserving input order.

    Raises AssertionError if the same hash appears more than once.
    """
    rc: Dict[str, GitCommit] = {}
    for commit in commits:
        assert commit.commit_hash not in rc, f"duplicate commit {commit.commit_hash}"
        rc[commit.commit_hash] = commit
    return rc
def fetch_json(url: str, params: Optional[Dict[str, Any]] = None) -> Any:
    """Fetch *url* and return the decoded JSON response.

    Args:
        url: Address to request.
        params: Optional query parameters. They are URL-encoded and appended
            to *url* after a ``?``; values are converted with ``str()``.

    Returns:
        Whatever the response body decodes to; callers in this file rely on
        both list and dict responses.
    """
    # Imported locally, like the other helper-specific imports in this file.
    from urllib.parse import urlencode

    headers = {'Accept': 'application/vnd.github.v3+json'}
    if params:
        # urlencode escapes reserved characters that plain string
        # interpolation would leave in the query string unescaped.
        url += '?' + urlencode(params)
    with urlopen(Request(url, headers=headers)) as data:
        return json.load(data)
def fetch_multipage_json(url: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Fetch every page of a paginated JSON list and concatenate the results.

    Pages are requested with ``page=1, 2, ...`` until a page adds no items.
    *params* must not already contain ``page`` (AssertionError otherwise) and
    is never modified.
    """
    # Private copy: the page counter is written into it on every iteration.
    query = dict(params) if params else {}
    assert "page" not in query
    rc: List[Dict[str, Any]] = []
    page_idx = 1
    while True:
        query["page"] = page_idx
        page = fetch_json(url, query)
        if not page:
            # An empty page means the previous one was the last.
            break
        rc += page
        page_idx += 1
    return rc
def gh_get_milestones(org: str = 'pytorch', project: str = 'pytorch', state: IssueState = IssueState.OPEN) -> List[Dict[str, Any]]:
    """Return all milestones of ``org/project`` in the given *state* from the GitHub REST API."""
    # urlopen needs an absolute URL; a bare "org/project/..." path is rejected
    # as an unknown URL type.
    url = f'https://api.github.com/repos/{org}/{project}/milestones'
    return fetch_multipage_json(url, {"state": state})
def gh_get_milestone_issues(org: str, project: str, milestone_idx: int, state: IssueState = IssueState.OPEN):
    """Return all issues of ``org/project`` assigned to milestone *milestone_idx* in the given *state*."""
    # urlopen needs an absolute URL; a bare "org/project/..." path is rejected
    # as an unknown URL type.
    url = f'https://api.github.com/repos/{org}/{project}/issues'
    return fetch_multipage_json(url, {"milestone": milestone_idx, "state": state})
def gh_get_ref_statuses(org: str, project: str, ref: str) -> Dict[str, Any]:
    """Return the status document for *ref* with every page of ``statuses`` merged in.

    The first response is returned as-is, except that its ``statuses`` list is
    extended with the entries of all following pages.
    """
    # urlopen needs an absolute URL; a bare "org/project/..." path is rejected
    # as an unknown URL type.
    url = f'https://api.github.com/repos/{org}/{project}/commits/{ref}/status'
    params = {"page": 1, "per_page": 100}
    nrc = rc = fetch_json(url, params)
    # A full page (exactly per_page entries) means there may be another one.
    while "statuses" in nrc and len(nrc["statuses"]) == 100:
        params["page"] += 1
        nrc = fetch_json(url, params)
        if "statuses" in nrc:
            rc["statuses"] += nrc["statuses"]
    return rc
def extract_statuses_map(json: Dict[str, Any]) -> Dict[str, str]:
    """Map each status ``context`` to its ``state`` from a status document.

    *json* must have a ``statuses`` list of dicts carrying ``context`` and
    ``state`` keys. If a context occurs more than once, the last entry wins.
    """
    # NOTE: the parameter name shadows the json module inside this function;
    # it is kept because it is part of the call signature.
    return {s["context"]: s["state"] for s in json["statuses"]}
def print_monthly_stats(commits: List[GitCommit]):
    """Print one summary line per calendar month of *commits*.

    Commits are bucketed by commit date (author date when the commit date is
    missing). A new bucket starts whenever the year or month differs from the
    previous commit's, so the input is expected to be ordered by date. Each
    line reports total commits, reverts, revert ratio and distinct authors.
    """
    def report(year, month, total, reverts, authors) -> None:
        print(f"{year}-{month:02d}: commits {total} reverts {reverts} ratio {100.0 * reverts / total:.2f}% authors {len(authors)}")

    y, m, total, reverts, authors = None, None, 0, 0, set()
    for commit in commits:
        commit_date = commit.commit_date if commit.commit_date is not None else commit.author_date
        if y != commit_date.year or m != commit_date.month:
            if y is not None:
                report(y, m, total, reverts, authors)
            y, m, total, reverts, authors = commit_date.year, commit_date.month, 0, 0, set()
        if is_revert(commit):
            reverts += 1
        total += 1
        # Without this the "authors" figure would always be zero.
        authors.add(commit.author)
    # Flush the final month: inside the loop a bucket is only reported when
    # the next month begins, so the last one would otherwise be dropped.
    if y is not None:
        report(y, m, total, reverts, authors)
def analyze_reverts(commits: List[GitCommit]):
    """Compare the status checks of every revert with those of the commit it reverts.

    For each revert in *commits*, the 99 entries following it in the list are
    searched for the commit whose ``Differential Revision`` equals the reverted
    id. When found, both commits' statuses are fetched and every context
    present in both whose state differs is printed.
    """
    for idx, commit in enumerate(commits):
        revert_id = get_revert_revision(commit)
        if revert_id is None:
            continue
        # Slicing clamps at the end of the list, so a revert near the tail
        # cannot index past it; `None` marks "not found" explicitly.
        orig_commit = next(
            (candidate for candidate in commits[idx + 1:idx + 100]
             if get_diff_revision(candidate) == revert_id),
            None,
        )
        if orig_commit is None:
            print(f"Failed to find original commit for {commit.title}")
            continue
        print(f"{commit.commit_hash} is a revert of {orig_commit.commit_hash}: {orig_commit.title}")
        revert_statuses = gh_get_ref_statuses("pytorch", "pytorch", commit.commit_hash)
        orig_statuses = gh_get_ref_statuses("pytorch", "pytorch", orig_commit.commit_hash)
        orig_sm = extract_statuses_map(orig_statuses)
        revert_sm = extract_statuses_map(revert_statuses)
        for k in revert_sm.keys():
            # Only contexts reported for both commits can be compared.
            if k not in orig_sm:
                continue
            if orig_sm[k] != revert_sm[k]:
                print(f"{k} {orig_sm[k]}->{revert_sm[k]}")
# Script entry point: load the history reachable from upstream/master out of a
# checkout at a fixed location under the user's home directory.
if __name__ == "__main__":
    repo = GitRepo(os.path.expanduser("~/git/pytorch/pytorch"), "upstream")
    # NOTE(review): the parsed commits are bound to `x` but nothing visible
    # here consumes them.
    x = repo._run_git_log("upstream/master")
# NOTE(review): this guard compares against "__main1__", which is not the name
# a directly executed script gets ("__main__"), so the block below looks
# deliberately disabled. Confirm before relying on it.
# NOTE(review): the block structure (indentation) is not preserved here and the
# final `if` has no visible body, so the exact nesting and what happens to
# issues that are not skipped cannot be determined from this text.
if __name__ == "__main1__":
repo = GitRepo(os.path.expanduser("~/git/pytorch/pytorch"), "upstream")
# get_commit_list prefixes both refs with the remote name, so these are the
# ranges upstream/orig/release/1.8..upstream/master and
# upstream/orig/release/1.8..upstream/release/1.8, each indexed by commit hash.
master_commits = build_commit_dict(repo.get_commit_list("orig/release/1.8", "master"))
release_commits = build_commit_dict(repo.get_commit_list("orig/release/1.8", "release/1.8"))
# Walk every issue (open and closed) attached to milestone 21.
for issue in gh_get_milestone_issues('pytorch', 'pytorch', 21, IssueState.ALL):
html_url, state = issue["html_url"], issue["state"]
# Skip closed issues if they were landed before the merge date
if state == "closed":
# `html_url in commit` uses GitCommit.__contains__, i.e. the URL occurs in the
# commit's title or body.
mentioned_after_cut = any(html_url in commit_message for commit_message in master_commits.values())
# If the issue is not mentioned after the cut, then it must be present in the release branch
if not mentioned_after_cut:
mentioned_in_release = any(html_url in commit_message for commit_message in release_commits.values())
# If the issue is mentioned in the release branch, then it was picked already
if mentioned_in_release:
