rebane2001/gen_data.py

## gen_data.py
import json
import os
import re
import git
import itertools

# Directory holding the inputs (issues.json, comments/) and the generated cache file.
data_path = "data"
# Local git checkouts used to resolve commits. Each key is compared with the
# first path segment of a commit reference and is also used as the
# "<repo>!" prefix of the recorded file names.
repo_paths = {
    "chromium": "C:/depot_tools/chromium/src",
    "v8": "C:/depot_tools/v8/v8",
}

"""
This code is pretty hastily thrown together, it's more of a prototype than anything. Run this first and then run gen_html.py.
I realized later on you can actually skip all of the git stuff and just get the affected files from the Git Watcher comment :p.
"""

def load_issues():
    """Load every issue from ``{data_path}/issues.json``.

    Returns:
        dict: the issue objects from the file, keyed by their ``localId``.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(f"{data_path}/issues.json", "r", encoding="utf-8") as f:
        issues = json.load(f)
    return {issue["localId"]: issue for issue in issues}


def load_comments():
    """Load all per-issue comment files and collect the commits they mention.

    Every file in ``{data_path}/comments`` must be named ``<issue id>.<ext>``
    and contain a JSON list of comment objects.

    Returns:
        tuple: ``(comments, commits)``. ``comments`` maps an issue id to its
        list of comments. ``commits`` maps an issue id to the revision paths
        (the URL part after ``https://chromium.googlesource.com/``) found in
        comments that announce a revision referring to the bug.

    Raises:
        ValueError: if a file name does not start with an integer issue id.
    """
    # Compiled once; it is applied to every comment of every issue.
    refer_pattern = re.compile(r"The following revision refers to this bug:\n"
                               r"..https://chromium.googlesource.com/([^ \n]+)",
                               re.MULTILINE)
    comments = {}
    commits = {}
    comments_dir = f"{data_path}/comments"
    comment_names = os.listdir(comments_dir)
    for i, name in enumerate(comment_names):
        print(f"{i + 1}/{len(comment_names)}")
        # The issue id is the file name without its extension.
        issue_id = int(os.path.splitext(name)[0])
        # Decode as UTF-8 explicitly; comment text is free-form and the
        # platform default encoding may not be able to read it.
        with open(f"{comments_dir}/{name}", "r", encoding="utf-8") as f:
            comments[issue_id] = json.load(f)
        matches = (refer_pattern.search(comment.get("content", ""))
                   for comment in comments[issue_id])
        # At most one revision is taken from each comment.
        commits[issue_id] = [match.group(1) for match in matches if match]
    return comments, commits


def get_repo(path):
    """Open the git repository at *path*, refusing bare repositories.

    Raises:
        FileNotFoundError: if the opened repository is bare.
    """
    opened = git.Repo(path)
    if not opened.bare:
        return opened
    raise FileNotFoundError("Git repo is bare")


def get_diffs(repo, commit):
    """Diff the revision ``<commit>~`` against *commit* in *repo*."""
    parent = repo.commit(f"{commit}~")
    return parent.diff(commit)


def get_diff_files(diffs, prefix=""):
    """Return the UTF-8 decoded path of each diff, prepended with *prefix*.

    The ``b_rawpath`` of a diff is used when it is set, otherwise its
    ``a_rawpath``.
    """
    paths = []
    for entry in diffs:
        raw_path = entry.b_rawpath or entry.a_rawpath
        paths.append(prefix + raw_path.decode("utf-8"))
    return paths


def get_issue_reward(issue):
    """Return the amount from the first ``reward-<N>`` label of *issue*.

    Args:
        issue: issue dict; ``labelRefs``, when present, is a list of dicts
            that each have a ``label`` string.

    Returns:
        int: the number from the first label matching ``reward-<digits>``
        (case-insensitive), or 0 when no label matches or the issue has no
        labels at all.
    """
    # A missing or empty "labelRefs" means "no labels"; it should not abort
    # the whole run with a KeyError.
    for label in issue.get("labelRefs") or ():
        reward_tag = re.search(r"^reward-(\d+)$", label["label"], re.IGNORECASE)
        if reward_tag:
            return int(reward_tag.group(1))
    return 0


def get_issue_cve(issue):
    """Return the first ``CVE-<digits>-<digits>`` label of *issue*.

    Args:
        issue: issue dict; ``labelRefs``, when present, is a list of dicts
            that each have a ``label`` string.

    Returns:
        str | None: the matching label exactly as written (matching is
        case-insensitive), or None when no label matches or the issue has
        no labels at all.
    """
    # A missing or empty "labelRefs" means "no labels"; it should not abort
    # the whole run with a KeyError.
    for label in issue.get("labelRefs") or ():
        cve_tag = re.search(r"^(CVE-\d+-\d+)$", label["label"], re.IGNORECASE)
        if cve_tag:
            return cve_tag.group(1)
    return None


def print_issue_summary(issue):
    """Print one line for *issue*: reporter, reward, CVE label, id and summary."""
    reporter = f'[{issue["reporterRef"]["displayName"]}]'
    reward = f'${get_issue_reward(issue)}'
    cve = get_issue_cve(issue)  # printed as "None" when the issue has no CVE label
    title = f'{issue["localId"]} - {issue["summary"]}'
    print(reporter, reward, cve, title)


def _lookup_issue_files(repos, issue_commits):
    """Return the prefixed paths changed by every commit that belongs to a known repo.

    Commits whose repository is not in *repos* are ignored. A ValueError
    raised while resolving a commit is left for the caller to handle.
    """
    issue_files = []
    for commit in issue_commits:
        # The first path segment names the repository, the last one is the revision.
        repo_name = commit.split("/")[0]
        if repo_name not in repos:
            continue
        diffs = get_diffs(repos[repo_name], commit.split("/")[-1])
        issue_files.extend(get_diff_files(diffs, prefix=repo_name + "!"))
    return issue_files


def _build_file_data(issues, commits, repos):
    """Split each rewarded issue's reward evenly across the files its commits changed.

    Returns:
        dict: ``"<repo>!<path>"`` mapped to ``{"value": total, "issues": [...]}``.
    """
    file_data = {}
    for i, (issue_id, issue) in enumerate(issues.items()):
        print(f"{i}/{len(issues)}")
        # if issue["reporterRef"]["userId"] != "2229699947":
        #     continue
        issue_reward = get_issue_reward(issue)
        if issue_reward == 0:
            continue
        print_issue_summary(issue)
        # An issue without a comments file has no known commits; treat it
        # like any other issue whose files cannot be determined.
        issue_commits = commits.get(issue_id, [])
        try:
            issue_files = _lookup_issue_files(repos, issue_commits)
        except ValueError as e:
            print(f"Error while looking up commit_files for {issue_id}: ", e)
            continue
        if not issue_files:
            print(f"Couldn't figure out the files for issue {issue_id} with commits {issue_commits}")
            continue
        # A file changed by several commits appears several times and
        # therefore receives several shares.
        per_file_value = issue_reward / len(issue_files)
        for file in issue_files:
            entry = file_data.setdefault(file, {"value": 0, "issues": []})
            entry["value"] += per_file_value
            entry["issues"].append({
                "issue_id": issue_id,
                "per_file_value": per_file_value,
                "issue_reward": issue_reward,
                "reporter_name": issue["reporterRef"]["displayName"],
                "reporter_id": issue["reporterRef"]["userId"],
            })
    return file_data


def main():
    """Load or build the per-file reward totals, then print them lowest first.

    The totals are read from ``{data_path}/file_data_cache.json`` when that
    file exists. Otherwise they are computed from the issues, their comments
    and the local git checkouts, and written to that file.
    """
    file_data_cache = f"{data_path}/file_data_cache.json"
    if os.path.exists(file_data_cache):
        with open(file_data_cache, "r", encoding="utf-8") as f:
            file_data = json.load(f)
    else:
        issues = load_issues()
        _comments, commits = load_comments()
        repos = {name: get_repo(path) for name, path in repo_paths.items()}
        file_data = _build_file_data(issues, commits, repos)
        # "x" refuses to overwrite a cache that appeared in the meantime.
        with open(file_data_cache, "x", encoding="utf-8") as f:
            json.dump(file_data, f)
    for k, v in sorted(file_data.items(), key=lambda item: (item[1]["value"], item[0])):
        print(f'[${round(v["value"])}] {k}')


if __name__ == '__main__':
    main()

## gen_html.py
import json
import html

data_path = "data"
file_data_cache = f"{data_path}/file_data_cache.json"

# Run gen_data.py first

with open(file_data_cache, "r", encoding="utf-8") as f:
    file_data = json.load(f)

# Nested dicts mirroring the directory layout. A leaf (one file) is a dict
# with "value" and "related_issues"; every other dict is a folder.
dirtree = {}

for filename, data in file_data.items():
    # "!" separates the repo name from the path, so the repo becomes the top-level folder.
    *folders, leaf_name = filename.replace("!", "/").split("/")
    current_obj = dirtree
    for folder in folders:
        current_obj = current_obj.setdefault(folder, {})
    leaf = current_obj.setdefault(leaf_name, {"value": 0, "related_issues": []})
    leaf["value"] += data["value"]
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # generated page is the same on every run (set() order varies between runs).
    leaf["related_issues"] += list(dict.fromkeys(str(int(issue["issue_id"])) for issue in data["issues"]))


def recursively_html(name, obj):
    """Render *obj* (one file or a folder of files) as HTML.

    A dict with a ``related_issues`` key is rendered as a single file line.
    Any other dict is treated as a folder: its values are rendered
    recursively and listed from highest to lowest value inside a
    ``<details>`` element.

    Args:
        name: label to show; it is HTML-escaped.
        obj: leaf dict (``value``, ``related_issues``) or folder dict.

    Returns:
        tuple: ``(markup, value, issue_count)``. For a folder, the value and
        the count are the sums over its children.
    """
    if "related_issues" in obj:
        issue_ids = obj["related_issues"]
        # "return" passes the handler's false result back to the browser so
        # the click does not also navigate to the "#!" placeholder href.
        markup = (
            f'<p><a href="#!" onclick="return sri(this, [{",".join(issue_ids)}])">{html.escape(name)}</a>'
            f' - <span class="money">${obj["value"]:,.0f}</span>'
            f' <span class="issues">{len(issue_ids)}</span></p>'
        )
        return markup, obj["value"], len(issue_ids)
    children = sorted(
        (recursively_html(child_name, child) for child_name, child in obj.items()),
        key=lambda rendered: rendered[1],
        reverse=True,
    )
    total_value = sum(rendered[1] for rendered in children)
    total_issues = sum(rendered[2] for rendered in children)
    markup = (
        f'<details><summary>{html.escape(name)} - <span class="money">${total_value:,.0f}</span> '
        f'<span class="issues">{total_issues}</span></summary>'
        f'{"".join(rendered[0] for rendered in children)}</details>'
    )
    return markup, total_value, total_issues


def clean_json_data(json_data):
    """Reduce the raw issues JSON to the fields kept for the generated page.

    Args:
        json_data: JSON text encoding a list of issue objects.

    Returns:
        str: JSON text of a list of objects with ``localId``, ``labelRefs``
        (only the labels whose text contains "reward", case-insensitive)
        and ``summary``.
    """
    issues = json.loads(json_data)
    slimmed = [{
        "localId": issue["localId"],
        # An issue without "labelRefs" gets an empty list instead of raising KeyError.
        "labelRefs": [label for label in issue.get("labelRefs") or []
                      if "reward" in label["label"].lower()],
        "summary": issue["summary"],
    } for issue in issues]
    return json.dumps(slimmed)


# Read both inputs before opening the output, so a missing input does not
# leave a truncated page behind. Everything is handled as UTF-8, matching
# the <meta charset="UTF-8"> declared in the template.
with open("gen_html_base.html", "r", encoding="utf-8") as base:
    template = base.read()
with open(f"{data_path}/issues.json", "r", encoding="utf-8") as json_data:
    # "</" is escaped so issue text cannot close the surrounding <script> element.
    issues_json = clean_json_data(json_data.read()).replace("</", "<\\/")
tree_html = recursively_html("root", dirtree)[0]
# Fill each placeholder only where it occurs in the template itself: text
# inserted for one placeholder is never scanned for the other one.
page = tree_html.join(piece.replace("JSON_DATA", issues_json)
                      for piece in template.split("HTML_SLOT"))
with open(f"{data_path}/repo_browse.html", "w", encoding="utf-8") as f:
    f.write(page)

## gen_html_base.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Money Tree</title>
    <style>
        body, html {
            font-family: sans-serif;
        }
        .money {
            color: green;
        }
        .issues {
            color: red;
        }
        details {
            margin: 5px 5px 5px 12px;
            border: 1px solid #aaa;
            border-radius: 4px;
            padding: 4px;
        }
        summary {
            cursor: pointer;
        }
        summary:hover {
            background: #EFF;
        }
        p {
            margin-top: 2px;
            margin-bottom: 2px;
            margin-left: 30px;
        }
        #hover {
            position: fixed;
            border: 1px solid #aaa;
            border-radius: 4px;
            background: beige;
            display: none;
        }
        a[href="#!"]:visited {
            color:blue;
        }
    </style>
</head>
<body>
<h1>Chromium Money Tree Browser</h1>
<p>This site maps Chrome VRP (bug bounty) rewards to changes (fixes) in specific files</p>
<p>It's very very hacked together, don't expect good UX or accurate data 😅</p>
<p>The bug bounty reward gets divided between files, eg if a fix to a $1000 bug changed 5 files, each file gets $200</p>
<p>Data is up to the beginning of November 2023</p>
<p><a href="https://twitter.com/rebane2001/status/1727066895327867322">Discuss on Twitter</a> :P</p>
HTML_SLOT
<div id="hover"></div>
<script>
    const hover = document.getElementById("hover");
    // Replace the five HTML-significant characters (& < > " ') with their
    // entities so the text can be placed inside innerHTML markup.
    const escapeHtml = (unsafe) => {
        const entities = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#039;'};
        return unsafe.replace(/[&<>"']/g, (ch) => entities[ch]);
    }
    const json_data = JSON_DATA;
    /**
     * Show the hover box below `el`, with one line per related issue.
     *
     * @param {Element} el - element the box is positioned under.
     * @param {number[]} issues - localId values to look up in json_data.
     * @returns {boolean} always false, so a handler that returns this value
     *     cancels the default action of the click.
     */
    function showRelatedIssues(el, issues) {
        const lines = issues
            .map(issueId => json_data.find(({localId}) => localId === issueId))
            // Ids that are not in json_data are skipped instead of throwing on undefined.
            .filter(issue => issue !== undefined)
            .map(issue => {
                const rewardLabel = (issue["labelRefs"] || [])
                    .map(e => e.label)
                    .find(e => /^reward-(\d+)$/.test(e.toLowerCase()));
                // Show "?" when the issue carries no reward-<N> label.
                const reward = rewardLabel ? rewardLabel.split("-")[1] : "?";
                return `[$${reward}] <a href="https://bugs.chromium.org/p/chromium/issues/detail?id=${issue["localId"]}">${escapeHtml(issue["summary"])}</a>`;
            });
        hover.innerHTML = lines.join("<br>");
        const bounds = el.getBoundingClientRect();
        hover.style.top = bounds.bottom + "px";
        // hover.style.left = bounds.left + "px";
        hover.style.left = "8px";
        hover.style.display = "block";
        // To prevent default behavior
        return false;
    }
    // Short alias for showRelatedIssues, for use from inline onclick attributes.
    const sri = showRelatedIssues;
    // Hide the hover box again as soon as the pointer leaves it.
    hover.onmouseleave = () => {
      hover.style.display = "none";
    };
</script>
</body>
</html>
	import json
	import os
	import re
	import git
	import itertools

	# Directory holding the inputs (issues.json, comments/) and the generated cache file.
	data_path = "data"
	# Local git checkouts used to resolve commits. Each key is compared with the
	# first path segment of a commit reference and is also used as the
	# "<repo>!" prefix of the recorded file names.
	repo_paths = {
	"chromium": "C:/depot_tools/chromium/src",
	"v8": "C:/depot_tools/v8/v8",
	}

	"""
	This code is pretty hastily thrown together, it's more of a prototype than anything. Run this first and then run gen_html.py.
	I realized later on you can actually skip all of the git stuff and just get the affected files from the Git Watcher comment :p.
	"""

	def load_issues():
	    """Load every issue from ``{data_path}/issues.json``.

	    Returns:
	        dict: the issue objects from the file, keyed by their ``localId``.
	    """
	    # Decode as UTF-8 explicitly instead of relying on the platform default
	    # encoding, which differs between operating systems.
	    with open(f"{data_path}/issues.json", "r", encoding="utf-8") as f:
	        issues = json.load(f)
	    return {issue["localId"]: issue for issue in issues}


	def load_comments():
	    """Load all per-issue comment files and collect the commits they mention.

	    Every file in ``{data_path}/comments`` must be named ``<issue id>.<ext>``
	    and contain a JSON list of comment objects.

	    Returns:
	        tuple: ``(comments, commits)``. ``comments`` maps an issue id to its
	        list of comments. ``commits`` maps an issue id to the revision paths
	        (the URL part after ``https://chromium.googlesource.com/``) found in
	        comments that announce a revision referring to the bug.

	    Raises:
	        ValueError: if a file name does not start with an integer issue id.
	    """
	    # Compiled once; it is applied to every comment of every issue.
	    refer_pattern = re.compile(r"The following revision refers to this bug:\n"
	                               r"..https://chromium.googlesource.com/([^ \n]+)",
	                               re.MULTILINE)
	    comments = {}
	    commits = {}
	    comments_dir = f"{data_path}/comments"
	    comment_names = os.listdir(comments_dir)
	    for i, name in enumerate(comment_names):
	        print(f"{i + 1}/{len(comment_names)}")
	        # The issue id is the file name without its extension.
	        issue_id = int(os.path.splitext(name)[0])
	        # Decode as UTF-8 explicitly; comment text is free-form and the
	        # platform default encoding may not be able to read it.
	        with open(f"{comments_dir}/{name}", "r", encoding="utf-8") as f:
	            comments[issue_id] = json.load(f)
	        matches = (refer_pattern.search(comment.get("content", ""))
	                   for comment in comments[issue_id])
	        # At most one revision is taken from each comment.
	        commits[issue_id] = [match.group(1) for match in matches if match]
	    return comments, commits


	def get_repo(path):
	    """Open the git repository at *path*, refusing bare repositories.

	    Raises:
	        FileNotFoundError: if the opened repository is bare.
	    """
	    repo = git.Repo(path)
	    if repo.bare:
	        raise FileNotFoundError("Git repo is bare")
	    return repo


	def get_diffs(repo, commit):
	    """Diff the revision ``<commit>~`` against *commit* in *repo*."""
	    return repo.commit(f"{commit}~").diff(commit)


	def get_diff_files(diffs, prefix=""):
	    """Return the UTF-8 decoded path of each diff, prepended with *prefix*.

	    The ``b_rawpath`` of a diff is used when it is set, otherwise its
	    ``a_rawpath``.
	    """
	    return [prefix + (diff.b_rawpath or diff.a_rawpath).decode("utf-8") for diff in diffs]


	def get_issue_reward(issue):
	    """Return the amount from the first ``reward-<N>`` label of *issue*.

	    Args:
	        issue: issue dict; ``labelRefs``, when present, is a list of dicts
	            that each have a ``label`` string.

	    Returns:
	        int: the number from the first label matching ``reward-<digits>``
	        (case-insensitive), or 0 when no label matches or the issue has no
	        labels at all.
	    """
	    # A missing or empty "labelRefs" means "no labels"; it should not abort
	    # the whole run with a KeyError.
	    for label in issue.get("labelRefs") or ():
	        reward_tag = re.search(r"^reward-(\d+)$", label["label"], re.IGNORECASE)
	        if reward_tag:
	            return int(reward_tag.group(1))
	    return 0


	def get_issue_cve(issue):
	    """Return the first ``CVE-<digits>-<digits>`` label of *issue*.

	    Args:
	        issue: issue dict; ``labelRefs``, when present, is a list of dicts
	            that each have a ``label`` string.

	    Returns:
	        str | None: the matching label exactly as written (matching is
	        case-insensitive), or None when no label matches or the issue has
	        no labels at all.
	    """
	    # A missing or empty "labelRefs" means "no labels"; it should not abort
	    # the whole run with a KeyError.
	    for label in issue.get("labelRefs") or ():
	        cve_tag = re.search(r"^(CVE-\d+-\d+)$", label["label"], re.IGNORECASE)
	        if cve_tag:
	            return cve_tag.group(1)
	    return None


	def print_issue_summary(issue):
	    """Print one line for *issue*: reporter, reward, CVE label, id and summary."""
	    print(f'[{issue["reporterRef"]["displayName"]}]',
	          f'${get_issue_reward(issue)}',
	          get_issue_cve(issue),  # printed as "None" when the issue has no CVE label
	          f'{issue["localId"]} - {issue["summary"]}')


	def _lookup_issue_files(repos, issue_commits):
	    """Return the prefixed paths changed by every commit that belongs to a known repo.

	    Commits whose repository is not in *repos* are ignored. A ValueError
	    raised while resolving a commit is left for the caller to handle.
	    """
	    issue_files = []
	    for commit in issue_commits:
	        # The first path segment names the repository, the last one is the revision.
	        repo_name = commit.split("/")[0]
	        if repo_name not in repos:
	            continue
	        diffs = get_diffs(repos[repo_name], commit.split("/")[-1])
	        issue_files.extend(get_diff_files(diffs, prefix=repo_name + "!"))
	    return issue_files


	def _build_file_data(issues, commits, repos):
	    """Split each rewarded issue's reward evenly across the files its commits changed.

	    Returns:
	        dict: ``"<repo>!<path>"`` mapped to ``{"value": total, "issues": [...]}``.
	    """
	    file_data = {}
	    for i, (issue_id, issue) in enumerate(issues.items()):
	        print(f"{i}/{len(issues)}")
	        # if issue["reporterRef"]["userId"] != "2229699947":
	        #     continue
	        issue_reward = get_issue_reward(issue)
	        if issue_reward == 0:
	            continue
	        print_issue_summary(issue)
	        # An issue without a comments file has no known commits; treat it
	        # like any other issue whose files cannot be determined.
	        issue_commits = commits.get(issue_id, [])
	        try:
	            issue_files = _lookup_issue_files(repos, issue_commits)
	        except ValueError as e:
	            print(f"Error while looking up commit_files for {issue_id}: ", e)
	            continue
	        if not issue_files:
	            print(f"Couldn't figure out the files for issue {issue_id} with commits {issue_commits}")
	            continue
	        # A file changed by several commits appears several times and
	        # therefore receives several shares.
	        per_file_value = issue_reward / len(issue_files)
	        for file in issue_files:
	            entry = file_data.setdefault(file, {"value": 0, "issues": []})
	            entry["value"] += per_file_value
	            entry["issues"].append({
	                "issue_id": issue_id,
	                "per_file_value": per_file_value,
	                "issue_reward": issue_reward,
	                "reporter_name": issue["reporterRef"]["displayName"],
	                "reporter_id": issue["reporterRef"]["userId"],
	            })
	    return file_data


	def main():
	    """Load or build the per-file reward totals, then print them lowest first.

	    The totals are read from ``{data_path}/file_data_cache.json`` when that
	    file exists. Otherwise they are computed from the issues, their comments
	    and the local git checkouts, and written to that file.
	    """
	    file_data_cache = f"{data_path}/file_data_cache.json"
	    if os.path.exists(file_data_cache):
	        with open(file_data_cache, "r", encoding="utf-8") as f:
	            file_data = json.load(f)
	    else:
	        issues = load_issues()
	        _comments, commits = load_comments()
	        repos = {name: get_repo(path) for name, path in repo_paths.items()}
	        file_data = _build_file_data(issues, commits, repos)
	        # "x" refuses to overwrite a cache that appeared in the meantime.
	        with open(file_data_cache, "x", encoding="utf-8") as f:
	            json.dump(file_data, f)
	    for k, v in sorted(file_data.items(), key=lambda item: (item[1]["value"], item[0])):
	        print(f'[${round(v["value"])}] {k}')


	if __name__ == '__main__':
	    main()
	import json
	import html

	data_path = "data"
	file_data_cache = f"{data_path}/file_data_cache.json"

	# Run gen_data.py first

	with open(file_data_cache, "r", encoding="utf-8") as f:
	    file_data = json.load(f)

	# Nested dicts mirroring the directory layout. A leaf (one file) is a dict
	# with "value" and "related_issues"; every other dict is a folder.
	dirtree = {}

	for filename, data in file_data.items():
	    # "!" separates the repo name from the path, so the repo becomes the top-level folder.
	    *folders, leaf_name = filename.replace("!", "/").split("/")
	    current_obj = dirtree
	    for folder in folders:
	        current_obj = current_obj.setdefault(folder, {})
	    leaf = current_obj.setdefault(leaf_name, {"value": 0, "related_issues": []})
	    leaf["value"] += data["value"]
	    # dict.fromkeys removes duplicates while keeping first-seen order, so the
	    # generated page is the same on every run (set() order varies between runs).
	    leaf["related_issues"] += list(dict.fromkeys(str(int(issue["issue_id"])) for issue in data["issues"]))


	def recursively_html(name, obj):
	    """Render *obj* (one file or a folder of files) as HTML.

	    A dict with a ``related_issues`` key is rendered as a single file line.
	    Any other dict is treated as a folder: its values are rendered
	    recursively and listed from highest to lowest value inside a
	    ``<details>`` element.

	    Args:
	        name: label to show; it is HTML-escaped.
	        obj: leaf dict (``value``, ``related_issues``) or folder dict.

	    Returns:
	        tuple: ``(markup, value, issue_count)``. For a folder, the value and
	        the count are the sums over its children.
	    """
	    if "related_issues" in obj:
	        issue_ids = obj["related_issues"]
	        # "return" passes the handler's false result back to the browser so
	        # the click does not also navigate to the "#!" placeholder href.
	        markup = (
	            f'<p><a href="#!" onclick="return sri(this, [{",".join(issue_ids)}])">{html.escape(name)}</a>'
	            f' - <span class="money">${obj["value"]:,.0f}</span>'
	            f' <span class="issues">{len(issue_ids)}</span></p>'
	        )
	        return markup, obj["value"], len(issue_ids)
	    children = sorted(
	        (recursively_html(child_name, child) for child_name, child in obj.items()),
	        key=lambda rendered: rendered[1],
	        reverse=True,
	    )
	    total_value = sum(rendered[1] for rendered in children)
	    total_issues = sum(rendered[2] for rendered in children)
	    markup = (
	        f'<details><summary>{html.escape(name)} - <span class="money">${total_value:,.0f}</span> '
	        f'<span class="issues">{total_issues}</span></summary>'
	        f'{"".join(rendered[0] for rendered in children)}</details>'
	    )
	    return markup, total_value, total_issues


	def clean_json_data(json_data):
	    """Reduce the raw issues JSON to the fields kept for the generated page.

	    Args:
	        json_data: JSON text encoding a list of issue objects.

	    Returns:
	        str: JSON text of a list of objects with ``localId``, ``labelRefs``
	        (only the labels whose text contains "reward", case-insensitive)
	        and ``summary``.
	    """
	    issues = json.loads(json_data)
	    slimmed = [{
	        "localId": issue["localId"],
	        # An issue without "labelRefs" gets an empty list instead of raising KeyError.
	        "labelRefs": [label for label in issue.get("labelRefs") or []
	                      if "reward" in label["label"].lower()],
	        "summary": issue["summary"],
	    } for issue in issues]
	    return json.dumps(slimmed)


	# Read both inputs before opening the output, so a missing input does not
	# leave a truncated page behind. Everything is handled as UTF-8, matching
	# the <meta charset="UTF-8"> declared in the template.
	with open("gen_html_base.html", "r", encoding="utf-8") as base:
	    template = base.read()
	with open(f"{data_path}/issues.json", "r", encoding="utf-8") as json_data:
	    # "</" is escaped so issue text cannot close the surrounding <script> element.
	    issues_json = clean_json_data(json_data.read()).replace("</", "<\\/")
	tree_html = recursively_html("root", dirtree)[0]
	# Fill each placeholder only where it occurs in the template itself: text
	# inserted for one placeholder is never scanned for the other one.
	page = tree_html.join(piece.replace("JSON_DATA", issues_json)
	                      for piece in template.split("HTML_SLOT"))
	with open(f"{data_path}/repo_browse.html", "w", encoding="utf-8") as f:
	    f.write(page)
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<title>Money Tree</title>
	<style>
	body, html {
	font-family: sans-serif;
	}
	.money {
	color: green;
	}
	.issues {
	color: red;
	}
	details {
	margin: 5px 5px 5px 12px;
	border: 1px solid #aaa;
	border-radius: 4px;
	padding: 4px;
	}
	summary {
	cursor: pointer;
	}
	summary:hover {
	background: #EFF;
	}
	p {
	margin-top: 2px;
	margin-bottom: 2px;
	margin-left: 30px;
	}
	#hover {
	position: fixed;
	border: 1px solid #aaa;
	border-radius: 4px;
	background: beige;
	display: none;
	}
	a[href="#!"]:visited {
	color:blue;
	}
	</style>
	</head>
	<body>
	<h1>Chromium Money Tree Browser</h1>
	<p>This site maps Chrome VRP (bug bounty) rewards to changes (fixes) in specific files</p>
	<p>It's very very hacked together, don't expect good UX or accurate data 😅</p>
	<p>The bug bounty reward gets divided between files, eg if a fix to a $1000 bug changed 5 files, each file gets $200</p>
	<p>Data is up to the beginning of November 2023</p>
	<p><a href="https://twitter.com/rebane2001/status/1727066895327867322">Discuss on Twitter</a> :P</p>
	HTML_SLOT
	<div id="hover"></div>
	<script>
	const hover = document.getElementById("hover");
	// Replace the five HTML-significant characters (& < > " ') with their
	// entities so the text can be placed inside innerHTML markup.
	// The replacement strings must be the entity forms; with the raw
	// characters the function escapes nothing and ''' does not even parse.
	const escapeHtml = (unsafe) => {
	    return unsafe.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;').replaceAll('"', '&quot;').replaceAll("'", '&#039;');
	}
	const json_data = JSON_DATA;
	/**
	 * Show the hover box below `el`, with one line per related issue.
	 *
	 * @param {Element} el - element the box is positioned under.
	 * @param {number[]} issues - localId values to look up in json_data.
	 * @returns {boolean} always false, so a handler that returns this value
	 *     cancels the default action of the click.
	 */
	function showRelatedIssues(el, issues) {
	    const lines = issues
	        .map(issueId => json_data.find(({localId}) => localId === issueId))
	        // Ids that are not in json_data are skipped instead of throwing on undefined.
	        .filter(issue => issue !== undefined)
	        .map(issue => {
	            const rewardLabel = (issue["labelRefs"] || [])
	                .map(e => e.label)
	                .find(e => /^reward-(\d+)$/.test(e.toLowerCase()));
	            // Show "?" when the issue carries no reward-<N> label.
	            const reward = rewardLabel ? rewardLabel.split("-")[1] : "?";
	            return `[$${reward}] <a href="https://bugs.chromium.org/p/chromium/issues/detail?id=${issue["localId"]}">${escapeHtml(issue["summary"])}</a>`;
	        });
	    hover.innerHTML = lines.join("<br>");
	    const bounds = el.getBoundingClientRect();
	    hover.style.top = bounds.bottom + "px";
	    // hover.style.left = bounds.left + "px";
	    hover.style.left = "8px";
	    hover.style.display = "block";
	    // To prevent default behavior
	    return false;
	}
	// Short alias for showRelatedIssues, for use from inline onclick attributes.
	const sri = showRelatedIssues;
	// Hide the hover box again as soon as the pointer leaves it.
	hover.onmouseleave = () => {
	hover.style.display = "none";
	};
	</script>
	</body>
	</html>