Last active
March 19, 2021 17:03
-
-
Save malfet/d34d3ef57c535696384246bbc79775c5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import enum
import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from urllib.parse import urlencode
from urllib.request import urlopen, Request
class IssueState(enum.Enum):
    """Issue/milestone state filter passed as the ``state`` query parameter."""

    OPEN = "open"
    CLOSED = "closed"
    ALL = "all"

    def __str__(self) -> str:
        # Render as the bare value so a member can be interpolated into a URL.
        return str(self.value)
class GitCommit:
    """Plain record describing one commit parsed from ``git log`` output."""

    commit_hash: str
    title: str
    body: str
    author: str
    author_date: datetime
    commit_date: Optional[datetime]

    def __init__(
        self,
        commit_hash: str,
        author: str,
        author_date: datetime,
        title: str,
        body: str,
        commit_date: Optional[datetime] = None,
    ) -> None:
        """Store the given fields; ``commit_date`` defaults to ``None``."""
        self.commit_hash, self.title, self.body = commit_hash, title, body
        self.author, self.author_date = author, author_date
        self.commit_date = commit_date

    def __contains__(self, item: Any) -> bool:
        """Support ``item in commit``: true if found in the body or the title."""
        if item in self.body:
            return True
        return item in self.title
# Title of the form "Revert D<digits>: ..."; group 1 is the revision id.
# Raw string: "\d" in a plain string literal is an invalid escape sequence.
_REVERT_TITLE_RE = re.compile(r"Revert (D\d+):")


def get_revert_revision(commit: GitCommit) -> Optional[str]:
    """Return the revision id named in a ``Revert D<digits>:`` title.

    The pattern is anchored at the start of ``commit.title``.

    Returns:
        The captured id (e.g. ``"D12345"``), or ``None`` when the title does
        not start with the revert prefix.
    """
    matched = _REVERT_TITLE_RE.match(commit.title)
    if matched is None:
        return None
    return matched.group(1)
# "Differential Revision: D<digits>" anywhere in the text; group 1 is the id.
# Raw string: "\s" and "\d" in a plain string literal are invalid escapes.
_DIFF_REVISION_RE = re.compile(r"\s*Differential Revision: (D\d+)")


def get_diff_revision(commit: GitCommit) -> Optional[str]:
    """Return the id from the first ``Differential Revision: D<digits>`` in the body.

    Returns:
        The captured id (e.g. ``"D12345"``), or ``None`` when ``commit.body``
        contains no such trailer.
    """
    found = _DIFF_REVISION_RE.search(commit.body)
    if found is None:
        return None
    return found.group(1)
def is_revert(commit: GitCommit) -> bool:
    """Tell whether ``get_revert_revision`` finds a revision id for the commit."""
    reverted_revision = get_revert_revision(commit)
    return reverted_revision is not None
def parse_medium_format(lines: Union[str, List[str]]) -> GitCommit:
    """
    Expect commit message generated using `--format=medium --date=unix` format, i.e.:
        commit <sha1>
        Author: <author>
        Date:   <author date>

        <title line>
        <full commit message>

    Args:
        lines: Either the whole record as one string or as a list of lines.
            A list passed in is not modified.

    Returns:
        A GitCommit whose ``commit_date`` is left unset.

    Raises:
        AssertionError: If the record does not have the expected layout.
    """
    if isinstance(lines, str):
        lines = lines.split("\n")
    # TODO: Handle merge commits correctly
    if len(lines) > 1 and lines[1].startswith("Merge:"):
        # Drop the "Merge:" header by building a new list, so the caller's
        # list is left untouched (the previous `del lines[1]` mutated it).
        lines = lines[:1] + lines[2:]
    assert len(lines) > 5, "commit record is too short"
    assert lines[0].startswith("commit"), "missing 'commit' header"
    assert lines[1].startswith("Author: "), "missing 'Author: ' header"
    assert lines[2].startswith("Date: "), "missing 'Date: ' header"
    assert len(lines[3]) == 0, "expected an empty line after the headers"
    return GitCommit(
        commit_hash=lines[0].split()[1].strip(),
        author=lines[1].split(":", 1)[1].strip(),
        # --date=unix prints integer seconds; fromtimestamp() yields naive local time.
        author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())),
        title=lines[4].strip(),
        body="\n".join(lines[5:]),
    )
def parse_fuller_format(lines: Union[str, List[str]]) -> GitCommit:
    """
    Expect commit message generated using `--format=fuller --date=unix` format, i.e.:
        commit <sha1>
        Author:     <author>
        AuthorDate: <author date>
        Commit:     <committer>
        CommitDate: <committer date>

        <title line>
        <full commit message>

    Args:
        lines: Either the whole record as one string or as a list of lines.
            A list passed in is not modified.

    Returns:
        A GitCommit with both ``author_date`` and ``commit_date`` populated.

    Raises:
        AssertionError: If the record does not have the expected layout.
    """
    if isinstance(lines, str):
        lines = lines.split("\n")
    # TODO: Handle merge commits correctly
    if len(lines) > 1 and lines[1].startswith("Merge:"):
        # Drop the "Merge:" header by building a new list, so the caller's
        # list is left untouched (the previous `del lines[1]` mutated it).
        lines = lines[:1] + lines[2:]
    assert len(lines) > 7, "commit record is too short"
    assert lines[0].startswith("commit"), "missing 'commit' header"
    assert lines[1].startswith("Author: "), "missing 'Author: ' header"
    assert lines[2].startswith("AuthorDate: "), "missing 'AuthorDate: ' header"
    assert lines[3].startswith("Commit: "), "missing 'Commit: ' header"
    assert lines[4].startswith("CommitDate: "), "missing 'CommitDate: ' header"
    assert len(lines[5]) == 0, "expected an empty line after the headers"
    return GitCommit(
        commit_hash=lines[0].split()[1].strip(),
        author=lines[1].split(":", 1)[1].strip(),
        # --date=unix prints integer seconds; fromtimestamp() yields naive local time.
        author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())),
        commit_date=datetime.fromtimestamp(int(lines[4].split(":", 1)[1].strip())),
        title=lines[6].strip(),
        body="\n".join(lines[7:]),
    )
class GitRepo:
    """Handle on a local git checkout that is queried through the ``git`` CLI."""

    def __init__(self, path, remote='upstream'):
        """Remember the checkout directory and the remote name used for refs."""
        self.remote = remote
        self.repo_dir = path

    def _run_git_log(self, revision_range):
        """Run ``git log`` on ``revision_range`` and parse each printed commit."""
        from subprocess import check_output
        command = ["git", "-C", self.repo_dir, "log", '--format=fuller', '--date=unix', revision_range]
        output_lines = check_output(command).decode("utf-8").split("\n")
        parsed = []
        pending = []
        for text in output_lines:
            # A line starting with "commit" opens the next record, so the
            # lines gathered so far form one complete commit.
            if text.startswith("commit") and len(pending) > 0:
                parsed.append(parse_fuller_format(pending))
                pending = []
            pending.append(text)
        # Whatever is left after the last header is the final record.
        if len(pending) > 0:
            parsed.append(parse_fuller_format(pending))
        return parsed

    def get_commit_list(self, from_ref, to_ref) -> List[GitCommit]:
        """Return commits in ``<remote>/<from_ref>..<remote>/<to_ref>``."""
        revision_range = f"{self.remote}/{from_ref}..{self.remote}/{to_ref}"
        return self._run_git_log(revision_range)
def build_commit_dict(commits: List[GitCommit]) -> Dict[str, GitCommit]:
    """Index the commits by ``commit_hash``; a repeated hash fails the assertion."""
    by_hash = {}
    for entry in commits:
        key = entry.commit_hash
        assert key not in by_hash
        by_hash[key] = entry
    return by_hash
def fetch_json(url: str, params: Optional[Dict[str, Any]] = None) -> Any:
    """Fetch ``url`` with the GitHub v3 JSON ``Accept`` header and decode the body.

    Args:
        url: Address to request.
        params: Optional query parameters. They are percent-encoded with
            ``urlencode`` (non-string values are converted with ``str()``)
            and appended after a ``?``.

    Returns:
        Whatever ``json.load`` produces for the response body. Callers in this
        file receive a list from some endpoints and an object from others.
    """
    headers = {'Accept': 'application/vnd.github.v3+json'}
    if params:
        # Hand-joining "name=value" pairs breaks on spaces, "&", "#", etc.
        url += '?' + urlencode(params)
    with urlopen(Request(url, headers=headers)) as data:
        return json.load(data)
def fetch_multipage_json(url: str, params: Dict[str, Any] = dict()) -> List[Dict[str, Any]]:
    """Concatenate successive pages of ``url`` until a page adds nothing.

    ``params`` must not already contain ``"page"``; a copy of it is extended
    with an increasing page number (starting at 1) for every request.
    """
    assert "page" not in params
    query = params.copy()
    collected = []
    page_no = 1
    while True:
        query["page"] = page_no
        page_no += 1
        count_before = len(collected)
        collected += fetch_json(url, query)
        # Stop once a request did not make the result any longer.
        if len(collected) <= count_before:
            break
    return collected
def gh_get_milestones(org='pytorch', project='pytorch', state: IssueState = IssueState.OPEN) -> List[Dict[str, Any]]:
    """Fetch every page of the repository's milestones filtered by ``state``."""
    query = {"state": state}
    endpoint = f'https://api.github.com/repos/{org}/{project}/milestones'
    return fetch_multipage_json(endpoint, query)
def gh_get_milestone_issues(org: str, project: str, milestone_idx: int, state: IssueState = IssueState.OPEN):
    """Fetch every page of issues attached to milestone ``milestone_idx``, filtered by ``state``."""
    query = {"milestone": milestone_idx, "state": state}
    endpoint = f'https://api.github.com/repos/{org}/{project}/issues'
    return fetch_multipage_json(endpoint, query)
def gh_get_ref_statuses(org: str, project: str, ref: str) -> Dict[str, Any]:
    """Fetch the status document for ``ref`` with all pages of ``statuses`` merged.

    The first response is returned; the ``statuses`` lists of any further
    pages are appended to its own ``statuses`` list.
    """
    page_size = 100
    url = f'https://api.github.com/repos/{org}/{project}/commits/{ref}/status'
    params = {"page": 1, "per_page": page_size}
    combined = fetch_json(url, params)
    latest = combined
    # A completely full page suggests there may be another one after it.
    while "statuses" in latest and len(latest["statuses"]) == page_size:
        params["page"] += 1
        latest = fetch_json(url, params)
        if "statuses" not in latest:
            break
        combined["statuses"] += latest["statuses"]
    return combined
def extract_statuses_map(json: Dict[str, Any]):
    """Map each entry's ``context`` to its ``state``; later duplicates win."""
    state_by_context = {}
    for status in json["statuses"]:
        state_by_context[status["context"]] = status["state"]
    return state_by_context
def _print_month_summary(year: int, month: int, total: int, reverts: int, author_count: int) -> None:
    """Print the one-line summary for a single calendar month."""
    print(f"{year}-{month:02d}: commits {total} reverts {reverts} ratio {100.0 * reverts / total :.2f}% authors {author_count}")


def print_monthly_stats(commits: List[GitCommit]):
    """Print per-month commit, revert and distinct-author counts.

    Consecutive commits sharing a year and month form one group, so the input
    is expected to be ordered by date. Each commit is dated by ``commit_date``,
    falling back to ``author_date`` when that is ``None``.

    Prints nothing for an empty list. The final group is printed too; it used
    to be dropped because the summary was only emitted on a month change.
    """
    y, m, total, reverts, authors = None, None, 0, 0, set()
    for commit in commits:
        commit_date = commit.commit_date if commit.commit_date is not None else commit.author_date
        if y != commit_date.year or m != commit_date.month:
            # Month changed: report the finished group, then start a new one.
            if y is not None:
                _print_month_summary(y, m, total, reverts, len(authors))
            y, m, total, reverts, authors = commit_date.year, commit_date.month, 0, 0, set()
        if is_revert(commit):
            reverts += 1
        total += 1
        authors.add(commit.author)
    # Flush the group that was still open when the input ran out.
    if y is not None:
        _print_month_summary(y, m, total, reverts, len(authors))
def analyze_reverts(commits: List[GitCommit], org: str = "pytorch", project: str = "pytorch", search_window: int = 99):
    """Report, for every revert commit, which CI statuses differ from the original.

    For each commit whose title names a reverted revision, the following
    ``search_window`` entries of ``commits`` are searched for the commit whose
    body carries that revision id. The statuses of both commits are then
    fetched from GitHub and every context present on both with a different
    state is printed.

    Args:
        commits: Commits to scan; the original of a revert is looked for at
            higher indices than the revert itself.
        org: GitHub organisation that owns the repository.
        project: GitHub repository name.
        search_window: How many commits after the revert are inspected.
    """
    for idx, commit in enumerate(commits):
        revert_id = get_revert_revision(commit)
        if revert_id is None:
            continue
        # Slicing cannot run past the end of the list, and next() yields None
        # when nothing matches; the old index loop raised IndexError near the
        # end and otherwise kept the last inspected commit as a false match.
        candidates = commits[idx + 1: idx + 1 + search_window]
        orig_commit = next((c for c in candidates if get_diff_revision(c) == revert_id), None)
        if orig_commit is None:
            print(f"Failed to find original commit for {commit.title}")
            continue
        print(f"{commit.commit_hash} is a revert of {orig_commit.commit_hash}: {orig_commit.title}")
        revert_statuses = gh_get_ref_statuses(org, project, commit.commit_hash)
        orig_statuses = gh_get_ref_statuses(org, project, orig_commit.commit_hash)
        orig_sm = extract_statuses_map(orig_statuses)
        revert_sm = extract_statuses_map(revert_statuses)
        for k, revert_state in revert_sm.items():
            if k not in orig_sm:
                continue
            if orig_sm[k] != revert_state:
                print(f"{k} {orig_sm[k]}->{revert_state}")
if __name__ == "__main__":
    # Print monthly commit/revert statistics for upstream/master of the local checkout.
    checkout_dir = os.path.expanduser("~/git/pytorch/pytorch")
    repo = GitRepo(checkout_dir, "upstream")
    commits = repo._run_git_log("upstream/master")
    print_monthly_stats(commits)
    #analyze_reverts(commits)
# NOTE: "__main1__" never equals __name__, so this section does not run as written.
if __name__ == "__main1__":
    repo = GitRepo(os.path.expanduser("~/git/pytorch/pytorch"), "upstream")
    master_commits = build_commit_dict(repo.get_commit_list("orig/release/1.8", "master"))
    release_commits = build_commit_dict(repo.get_commit_list("orig/release/1.8", "release/1.8"))
    print(f"len(master_commits)={len(master_commits)}")
    print(f"len(release_commits)={len(release_commits)}")
    print("URL;Title;Status")
    for issue in gh_get_milestone_issues('pytorch', 'pytorch', 21, IssueState.ALL):
        html_url = issue["html_url"]
        state = issue["state"]
        # Skip closed issues that landed before the branch cut or were already picked.
        if state == "closed":
            # If the issue is not mentioned after the cut, then it must already
            # be present in the release branch.
            if not any(html_url in commit for commit in master_commits.values()):
                continue
            # If the issue is mentioned in the release branch, then it was picked already.
            if any(html_url in commit for commit in release_commits.values()):
                continue
        print(f'{html_url};{issue["title"]};{state}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment