Skip to content

Instantly share code, notes, and snippets.

@asford
Created June 13, 2021 16:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save asford/4966a816d40d5e0579edbf4e0b9e410f to your computer and use it in GitHub Desktop.
Save asford/4966a816d40d5e0579edbf4e0b9e410f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import io\n",
"import logging\n",
"import subprocess\n",
"from contextlib import suppress\n",
"from typing import Iterable, Tuple"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"def _get_revision_tree_ids(commit_shas: Iterable[str]) -> Iterable[Tuple[str, str]]:\n",
"    \"\"\"Given a list of commit shas, read tree ids via a single `git cat-file --batch` call.\n",
"\n",
"    Yields (commit_sha, tree_sha) tuples for all commits successfully resolved to a tree object.\n",
"    Records of any other kind (missing, ambiguous, non-commit objects, commits whose\n",
"    content does not begin with a `tree <sha>` line) are logged and skipped.\n",
"\n",
"    Raises:\n",
"        subprocess.CalledProcessError: if git exits with a non-zero status.\n",
"        ValueError: if a record's content is not followed by the newline that the\n",
"            batch format places after every content block (truncated output).\n",
"    \"\"\"\n",
"\n",
"    # cat-file --batch prints object blobs in a record-oriented streaming format.\n",
"    # See:\n",
"    # https://git-scm.com/docs/git-cat-file#_batch_output\n",
"    cat_result = subprocess.check_output(\n",
"        [\"git\", \"cat-file\", \"--batch\"], input=\"\\n\".join(commit_shas).encode()\n",
"    )\n",
"\n",
"    reader = io.BytesIO(cat_result)\n",
"\n",
"    # Stream through the cat-file results, yielding (commit_sha, tree_sha)\n",
"    # when a valid commit object is found.\n",
"    #\n",
"    # In the happy path cat-file should emit a commit record for every sha,\n",
"    # however there may be missing objects/symlinks/etc in more esoteric\n",
"    # repository states.\n",
"    # Just ignore these records.\n",
"    #\n",
"    # Iterating with the readline sentinel guarantees that every `continue`\n",
"    # below still advances to the next record header.\n",
"    for object_line in iter(reader.readline, b\"\"):\n",
"        # We want objects of the form\n",
"        #   '<hash> commit <content_size>\\n<content>\\n'\n",
"        object_info = object_line.split()\n",
"        if not object_info:\n",
"            continue\n",
"\n",
"        # But there are alternative forms if the commit is missing, lives in a symlink, etc.\n",
"        #\n",
"        # If the record has any content the last entry will be an integer content size.\n",
"        # object_info may then be a three tuple or two tuple. Eg:\n",
"        #   <object> <type> <size>\\n<content>\\n\n",
"        #   symlink <size>\\n<content>\\n\n",
"        #\n",
"        # If the record has no content it will not end in a size. Eg:\n",
"        #   <object> missing\\n\n",
"        #   <object> ambiguous\\n\n",
"        #\n",
"        # Just try to parse for the size...\n",
"        size = None\n",
"        with suppress(ValueError):\n",
"            size = int(object_info[-1])\n",
"\n",
"        content = b\"\"\n",
"        if size is not None:\n",
"            # Always consume the content and its trailing newline, so the stream\n",
"            # is positioned at the next header even when this record is skipped.\n",
"            content = reader.read(size + 1)\n",
"            if content[-1:] != b\"\\n\":\n",
"                raise ValueError(\"truncated cat-file record: %r\" % (object_info,))\n",
"            content = content[:-1]\n",
"\n",
"        # Now we've read through the object in the content stream and\n",
"        # can safely continue if we don't find an object of interest.\n",
"        if len(object_info) < 2 or object_info[1] != b\"commit\":\n",
"            logging.warning(\"non-commit object reading tree hashes: %s\", object_info)\n",
"            continue\n",
"\n",
"        # commit objects are line-oriented records themselves,\n",
"        # but always start with the tree reference of the form:\n",
"        #\n",
"        #   tree <tree_sha>\\n\n",
"        #\n",
"        # https://git-scm.com/book/en/v2/Git-Internals-Git-Objects\n",
"        tree_fields = content.split(b\"\\n\", 1)[0].split()\n",
"        if len(tree_fields) != 2 or tree_fields[0] != b\"tree\":\n",
"            logging.warning(\"commit object didn't start with tree entry: %s\", object_info)\n",
"            continue\n",
"\n",
"        yield (object_info[0].decode(), tree_fields[1].decode())\n",
"\n",
"\n",
"def _get_matching_tree_commits_via_catfile(commit_sha: str, candidate_shas: Iterable[str]) -> list:\n",
"    \"\"\"Identify all candidate commits with tree contents matching commit_sha.\n",
"\n",
"    Tree ids are read through _get_revision_tree_ids, so candidates that do not\n",
"    resolve to a commit object are skipped rather than reported.\n",
"\n",
"    Returns the matching commit shas, as printed by cat-file, in the order they are read.\n",
"\n",
"    Raises:\n",
"        ValueError: if commit_sha does not resolve to exactly one commit.\n",
"    \"\"\"\n",
"    # Exactly one (sha, tree) pair is expected for the reference commit;\n",
"    # the unpacking fails loudly otherwise.\n",
"    ((_commit_rev, commit_tree),) = _get_revision_tree_ids([commit_sha])\n",
"\n",
"    return [\n",
"        other_rev\n",
"        for other_rev, other_tree in _get_revision_tree_ids(candidate_shas)\n",
"        if other_tree == commit_tree\n",
"    ]\n",
"\n",
"\n",
"def matching_tree_commits_via_difftree(commit_sha: str, candidate_shas: Iterable[str]) -> list:\n",
"    \"\"\"Identify all candidates with tree contents matching commit_sha via `git diff-tree`.\n",
"\n",
"    Runs one `git diff-tree --quiet` subprocess per candidate. An exit status of 0\n",
"    is treated as a match; any other status (differences, or a git error such as\n",
"    an unknown sha) is treated as a non-match.\n",
"\n",
"    Returns the matching candidate shas in input order.\n",
"    \"\"\"\n",
"    matches = []\n",
"    for candidate_sha in candidate_shas:\n",
"        check = subprocess.run(\n",
"            [\"git\", \"diff-tree\", \"--quiet\", commit_sha, candidate_sha]\n",
"        )\n",
"\n",
"        if check.returncode == 0:\n",
"            matches.append(candidate_sha)\n",
"\n",
"    return matches"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rev_list= !git rev-list HEAD ^HEAD~100\n",
"len(rev_list)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16.2 ms ± 530 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"_get_matching_tree_commits_via_catfile(rev_list[0], rev_list)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"858 ms ± 38.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"matching_tree_commits_via_difftree(rev_list[0], rev_list)"
]
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "percent",
"format_version": "1.3",
"jupytext_version": "1.11.3"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment