#!/usr/bin/env python
# -*- mode: python -*-
from __future__ import print_function, with_statement
import sys
import hashlib
import tempfile
import os
import subprocess
import shlex
import shutil
import itertools
import threading
import time
import collections
import posixpath
BLOCK_SIZE = 4096

def pathjoin(path, *paths):
    return posixpath.join(path, *paths)

def make_string(s):
    if isinstance(s, bytes):
        return s.decode("utf-8")
    else:
        return s

def make_buffer(s):
    if isinstance(s, bytes):
        return s
    else:
        return s.encode("utf-8")

def verbose_stderr(*args, **kwargs):
    return print(*map(make_string, args), file=sys.stderr, **kwargs)

def verbose_ignore(*args, **kwargs):
    pass

def ensure_binary_mode(stream):
    try:  # Attempt the Python-3 way, also needed to handle unicode
        return stream.detach()
    except Exception:  # Python-2 file objects have no detach()
        pass
    if sys.platform == "win32":
        # Fall back to Python-2 way, only needed on Windows
        import msvcrt
        msvcrt.setmode(stream.fileno(), os.O_BINARY)
    return stream

def mkdir_p(path):
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def readblocks(stream):
    bytecount = 0
    while True:
        data = stream.read(BLOCK_SIZE)
        bytecount += len(data)
        if not data:
            break
        yield data

def cat_iter(initer, outstream):
    for block in initer:
        outstream.write(block)

def cat(instream, outstream):
    return cat_iter(readblocks(instream), outstream)

def difftreez_reader(input):
    """Incremental reader for git diff-tree -z output

    :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ...
    """
    buffer = []
    partial = b''
    while True:
        newread = input.read(BLOCK_SIZE)
        if not newread:
            break
        partial += newread
        while True:
            head, sep, partial = partial.partition(b'\0')
            if not sep:
                partial = head
                break
            buffer.append(head)
            if len(buffer) == 2:
                oldmode, newmode, oldhash, newhash, modflag = buffer[0].split()
                path = buffer[1]
                yield (newhash, modflag, path)
                buffer = []
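
# Example: for a diff-tree record like
#   b':100644 100644 <oldsha1> <newsha1> M\0some/path\0'
# difftreez_reader yields the tuple (b'<newsha1>', b'M', b'some/path').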

def gitconfig_get(name, file=None):
    args = ['git', 'config', '--get']
    if file is not None:
        args += ['--file', make_string(file)]
    args.append(make_string(name))
    p = subprocess.Popen(args, stdout=subprocess.PIPE)
    output = p.communicate()[0].strip()
    if p.returncode != 0:
        return None
    else:
        return output

def gitconfig_set(name, value, file=None):
    args = ['git', 'config']
    if file is not None:
        args += ['--file', file]
    args += [name, value]
    subprocess.check_call(args)

class GitFat(object):
    DecodeError = RuntimeError

    def __init__(self):
        self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
        self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
        self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip()
        self.objdir = pathjoin(self.gitdir, b'fat', b'objects')
        if os.environ.get('GIT_FAT_VERSION') == '1':
            self.encode = self.encode_v1
        else:
            self.encode = self.encode_v2
        def magiclen(enc):
            return len(enc(hashlib.sha1(b'dummy').hexdigest().encode('ASCII'), 5))
        self.magiclen = magiclen(self.encode)  # Current version
        self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]]  # All prior versions

    def setup(self):
        mkdir_p(self.objdir)

    def get_rsync(self):
        cfgpath = pathjoin(self.gitroot, b'.gitfat')
        remote = gitconfig_get('rsync.remote', file=cfgpath)
        ssh_port = gitconfig_get('rsync.sshport', file=cfgpath)
        ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath)
        if remote is None:
            raise RuntimeError('No rsync.remote in %s' % cfgpath)
        return remote, ssh_port, ssh_user
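
    # A minimal .gitfat file at the repository root might look like this
    # (illustrative values; only rsync.remote is required, the ssh keys
    # are optional and match the lookups above):
    #   [rsync]
    #       remote = storage.example.com:fat-store
    #       sshuser = git
    #       sshport = 2222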

    def get_rsync_command(self, push):
        (remote, ssh_port, ssh_user) = self.get_rsync()
        if push:
            self.verbose('Pushing to:', remote)
        else:
            self.verbose('Pulling from:', remote)
        cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-']
        rshopts = ''
        if ssh_user:
            rshopts += ' -l ' + make_string(ssh_user)
        if ssh_port:
            rshopts += ' -P ' + make_string(ssh_port)  # plink takes -P for the port, unlike ssh's -p
        if rshopts:
            cmd.append('--rsh=plink' + rshopts)
        if push:
            cmd += [make_string(self.objdir) + '/', make_string(remote) + '/']
        else:
            cmd += [make_string(remote) + '/', make_string(self.objdir) + '/']
        return cmd

    def revparse(self, revname):
        return subprocess.check_output(['git', 'rev-parse', revname]).strip()

    def encode_v1(self, digest, bytecount):
        'Produce legacy representation of file to be stored in repository.'
        return (b'#$# git-fat ' + digest + b'\n')

    def encode_v2(self, digest, bytecount):
        'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.'
        return (b'#$# git-fat ' + digest + (' %20d\n' % (bytecount,)).encode('ASCII'))

    def decode(self, bstring, noraise=False):
        cookie = b'#$# git-fat '
        if bstring.startswith(cookie):
            parts = bstring[len(cookie):].split()
            digest = parts[0]
            # bytecount is an int, or None for v1 placeholders without a size
            bytecount = int(parts[1]) if len(parts) > 1 else None
            return digest, bytecount
        elif noraise:
            return None, None
        else:
            raise GitFat.DecodeError('Could not decode %s' % repr(bstring))
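
    # A v2 placeholder as stored in the repository is one 74-byte line:
    #   '#$# git-fat <40-hex sha1 of content> <byte count padded to 20 chars>\n'
    # The legacy v1 form is the same line without the byte count field.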

    def decode_stream(self, stream):
        'Return digest if git-fat cache, otherwise return iterator over entire file contents'
        preamble = stream.read(self.magiclen)
        try:
            return self.decode(preamble)
        except GitFat.DecodeError:
            # Not sure if this is the right behavior
            return itertools.chain([preamble], readblocks(stream)), None

    def decode_file(self, fname):
        # Fast check
        stat = os.stat(fname)
        if stat.st_size != self.magiclen:
            return False, None
        # read file
        digest, bytecount = self.decode_stream(open(fname, 'rb'))
        if isinstance(digest, bytes):
            return digest, bytecount
        else:
            return None, bytecount

    def decode_clean(self, body):
        '''
        Attempt to decode version in working tree. The tree version could be changed to have a more
        useful message than the machine-readable copy that goes into the repository. If the tree
        version decodes successfully, it indicates that the fat data is not currently available in
        this repository.
        '''
        digest, bytecount = self.decode(body, noraise=True)
        return digest

    def filter_clean(self, instream, outstreamclean):
        h = hashlib.new('sha1')
        bytecount = 0
        # mkstemp requires 'str' rather than native filesystem bytes
        fd, tmpname = tempfile.mkstemp(dir=self.objdir.decode(sys.getfilesystemencoding()))
        try:
            ishanging = False
            cached = False  # changes to True when file is cached
            with os.fdopen(fd, 'wb') as cache:
                outstream = cache
                firstblock = True
                for block in readblocks(instream):
                    if firstblock:
                        if len(block) == self.magiclen and self.decode_clean(block[0:self.magiclen]):
                            ishanging = True  # Working tree version is verbatim from repository (not smudged)
                            outstream = outstreamclean
                        firstblock = False
                    h.update(block)
                    bytecount += len(block)
                    outstream.write(block)
                outstream.flush()
            digest = h.hexdigest().encode('ASCII')
            objfile = pathjoin(self.objdir, digest)
            if not ishanging:
                if os.path.exists(objfile):
                    self.verbose('git-fat filter-clean: cache already exists:', objfile)
                    os.remove(tmpname)
                else:
                    os.rename(make_string(tmpname), make_string(objfile))
                    self.verbose('git-fat filter-clean: caching to:', objfile)
                cached = True
                outstreamclean.write(self.encode(digest, bytecount))
        finally:
            if not cached:
                os.remove(tmpname)

    def cmd_filter_clean(self):
        '''
        The clean filter runs when a file is added to the index. It gets the "smudged" (tree)
        version of the file on stdin and produces the "clean" (repository) version on stdout.
        '''
        self.setup()
        # Set stdin and stdout to binary mode
        sys.stdin = ensure_binary_mode(sys.stdin)
        sys.stdout = ensure_binary_mode(sys.stdout)
        self.filter_clean(sys.stdin, sys.stdout)

    def cmd_filter_smudge(self):
        self.setup()
        # Ensure streams are treated as binary
        sys.stdin = ensure_binary_mode(sys.stdin)
        sys.stdout = ensure_binary_mode(sys.stdout)
        result, bytecount = self.decode_stream(sys.stdin)
        if isinstance(result, bytes):  # We got a digest
            objfile = pathjoin(self.objdir, result)
            try:
                self.verbose('git-fat filter-smudge: restoring from:', objfile)
                cat(open(objfile, 'rb'), sys.stdout)
            except IOError:  # file not found
                self.verbose('git-fat filter-smudge: fat object missing:', objfile)
                sys.stdout.write(self.encode(result, bytecount))  # could leave a better notice about how to recover this file
        else:  # We have an iterable over the original input.
            self.verbose('git-fat filter-smudge: not a managed file')
            cat_iter(result, sys.stdout)

    def catalog_objects(self):
        return set(os.listdir(self.objdir))

    def referenced_objects(self, rev=None, all=False):
        referenced = set()
        if all:
            rev = '--all'
        elif rev is None:
            rev = self.revparse('HEAD')
        rev = make_string(rev)
        p1 = subprocess.Popen(['git', 'rev-list', '--objects', rev], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        def cut_sha1hash(input, output):
            for line in input:
                output.write(line.split()[0] + b'\n')
            output.close()
        cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
        cut_thread.start()
        for line in p2.stdout:
            objhash, objtype, size = line.split()
            if objtype == b'blob' and int(size) in self.magiclens:
                try:
                    objhash = make_string(objhash)
                    fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0]
                    referenced.add(fathash)
                except GitFat.DecodeError:
                    pass
        cut_thread.join()
        p1.wait()
        p2.wait()
        return referenced

    def orphan_files(self):
        'generator for all orphan placeholders in the working tree'
        for fname in subprocess.check_output(['git', 'ls-files']).splitlines():
            digest = self.decode_file(fname)[0]
            if digest:
                yield (digest, fname)

    def cmd_status(self, args):
        self.setup()
        catalog = self.catalog_objects()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        referenced = self.referenced_objects(**refargs)
        garbage = catalog - referenced
        orphans = referenced - catalog
        if '--all' in args:
            for obj in referenced:
                print(make_string(obj))
        if orphans:
            print('Orphan objects:')
            for orph in orphans:
                print(' ' + make_string(orph))
        if garbage:
            print('Garbage objects:')
            for g in garbage:
                print(' ' + make_string(g))

    def is_dirty(self):
        # diff-index --quiet exits 0 when the working tree matches HEAD
        return subprocess.call(['git', 'diff-index', '--quiet', 'HEAD']) != 0

    def cmd_push(self, args):
        'Push anything that I have stored and referenced'
        self.setup()
        # Default to push only those objects referenced by current HEAD
        # (includes history). Finer-grained pushing would be useful.
        pushall = '--all' in args
        files = self.referenced_objects(all=pushall) & self.catalog_objects()
        cmd = self.get_rsync_command(push=True)
        self.verbose('Executing:', ' '.join(cmd))
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input=b'\x00'.join(files))

    def checkout(self, show_orphans=False):
        'Update any stale files in the present working tree'
        for digest, fname in self.orphan_files():
            objpath = pathjoin(self.objdir, digest)
            if os.access(objpath, os.R_OK):
                print((b'Restoring ' + digest + b' -> ' + fname).decode(sys.getfilesystemencoding()))
                # The output of our smudge filter depends on the existence of
                # the file in .git/fat/objects, but git caches the file stat
                # from the previous time the file was smudged, therefore it
                # won't try to re-smudge. I don't know a git command that
                # specifically invalidates that cache, but touching the file
                # also does the trick.
                os.utime(fname, None)
                # This re-smudge is essentially a copy that restores permissions.
                subprocess.check_call(['git', 'checkout-index', '--index', '--force', fname])
            elif show_orphans:
                print('Data unavailable: %s %s' % (digest, fname))

    def cmd_pull(self, args):
        'Pull anything that I have referenced, but not stored'
        self.setup()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        for arg in args:
            if arg.startswith('-') or len(arg) != 40:
                continue
            rev = self.revparse(arg)
            if rev:
                refargs['rev'] = rev
        files = self.referenced_objects(**refargs) - self.catalog_objects()
        cmd = self.get_rsync_command(push=False)
        self.verbose('Executing:', ' '.join(cmd))
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input=b'\x00'.join(files))
        self.checkout()

    def cmd_checkout(self, args):
        self.checkout(show_orphans=True)

    def cmd_gc(self):
        garbage = self.catalog_objects() - self.referenced_objects()
        print('Unreferenced objects to remove: %d' % len(garbage))
        for obj in garbage:
            fname = pathjoin(self.objdir, obj)
            print('%10d %s' % (os.stat(fname).st_size, make_string(obj)))
            os.remove(fname)

    def cmd_init(self):
        self.setup()
        if gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge'):
            print('Git fat already configured, check configuration in .git/config')
        else:
            gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
            gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge')
            print('Initialized git fat')
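
    # Note: `git fat init` only registers the filters; files are actually
    # managed once .gitattributes routes them through the filter, e.g.
    # (illustrative pattern):
    #   *.psd filter=fat -text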

    def gen_large_blobs(self, revs, threshsize):
        """Generate (hash, size) pairs for every blob larger than threshsize bytes"""
        time0 = time.time()
        def hash_only(input, output):
            """The output of git rev-list --objects shows extra info for blobs, subdirectory trees, and tags.
            This truncates to one hash per line.
            """
            for line in input:
                output.write(line[:40] + b'\n')
            output.close()
        revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1)
        objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
        hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin))
        hashonly.start()
        numblobs = 0
        numlarge = 0
        for line in objcheck.stdout:
            objhash, blob, size = line.split()
            if blob != b'blob':
                continue
            size = int(size)
            numblobs += 1
            if size > threshsize:
                numlarge += 1
                yield objhash, size
        revlist.wait()
        objcheck.wait()
        hashonly.join()
        time1 = time.time()
        self.verbose('%d of %d blobs are > %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1 - time0))

    def cmd_find(self, args):
        # FIXME: Need input validation here
        threshsize = int(args[0])
        blobsizes = dict(self.gen_large_blobs('--all', threshsize))
        time0 = time.time()
        # Find all names assumed by large blobs (those in blobsizes)
        pathsizes = collections.defaultdict(lambda: set())
        revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1)
        difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'],
                                    stdin=revlist.stdout, stdout=subprocess.PIPE)
        for newblob, modflag, path in difftreez_reader(difftree.stdout):
            bsize = blobsizes.get(newblob)
            path = path.decode(sys.getfilesystemencoding())
            if bsize:  # We care about this blob
                pathsizes[path].add(bsize)
        time1 = time.time()
        self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1 - time0))
        maxlen = max(map(len, pathsizes)) if pathsizes else 0
        for path, sizes in sorted(pathsizes.items(), key=lambda item: max(item[1]), reverse=True):
            print('%-*s filter=fat -text # %10d %d' % (maxlen, path, max(sizes), len(sizes)))
        revlist.wait()
        difftree.wait()
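
    # Example: list every path that has ever held a blob larger than 5 MB,
    # formatted as candidate .gitattributes lines (the trailing comment shows
    # the largest blob size and the number of distinct sizes seen):
    #   git fat find 5242880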

    def cmd_index_filter(self, args):
        # FIXME: Need input validation here
        manage_gitattributes = '--manage-gitattributes' in args
        filelist = set(f.strip() for f in open(args[0], 'rb').readlines())
        lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
        updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
        for line in lsfiles.stdout:
            mode, sep, tail = line.partition(b' ')
            blobhash, sep, tail = tail.partition(b' ')
            stageno, sep, tail = tail.partition(b'\t')
            filename = tail.strip()
            if filename not in filelist:
                continue
            # This file will contain the hash of the cleaned object
            hashfile = pathjoin(self.gitdir, b'fat', b'index-filter', blobhash)
            try:
                cleanedobj = open(hashfile, 'rb').read().rstrip()
            except IOError:
                catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE)
                hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
                def dofilter():
                    self.filter_clean(catfile.stdout, hashobject.stdin)
                    hashobject.stdin.close()
                filterclean = threading.Thread(target=dofilter)
                filterclean.start()
                cleanedobj = hashobject.stdout.read().rstrip()
                catfile.wait()
                hashobject.wait()
                filterclean.join()
                mkdir_p(os.path.dirname(hashfile))
                open(hashfile, 'wb').write(cleanedobj + b'\n')
            updateindex.stdin.write(mode + b' ' + cleanedobj + b' ' + stageno + b'\t' + filename + b'\n')
        if manage_gitattributes:
            try:
                mode, blobsha1, stageno, filename = subprocess.check_output(['git', 'ls-files', '-s', '.gitattributes']).split()
                gitattributes_lines = subprocess.check_output(['git', 'cat-file', 'blob', blobsha1]).splitlines()
            except ValueError:  # Nothing to unpack, thus no file
                mode, stageno = b'100644', b'0'
                gitattributes_lines = []
            gitattributes_extra = [line.split()[0] + b' filter=fat -text' for line in filelist]
            hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            stdout, stderr = hashobject.communicate(b'\n'.join(gitattributes_lines + gitattributes_extra) + b'\n')
            updateindex.stdin.write(mode + b' ' + stdout.strip() + b' ' + stageno + b'\t.gitattributes\n')
        updateindex.stdin.close()
        lsfiles.wait()
        updateindex.wait()
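
    # Sketch of intended use with git filter-branch, rewriting history so
    # the listed paths become fat placeholders (fat-files is a hypothetical
    # file naming one path per line to convert):
    #   git filter-branch --index-filter \
    #       'git-fat index-filter /abs/path/to/fat-files --manage-gitattributes' \
    #       --tag-name-filter cat -- --all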

if __name__ == '__main__':
    fat = GitFat()
    cmd = sys.argv[1] if len(sys.argv) > 1 else ''
    if cmd == 'filter-clean':
        fat.cmd_filter_clean()
    elif cmd == 'filter-smudge':
        fat.cmd_filter_smudge()
    elif cmd == 'init':
        fat.cmd_init()
    elif cmd == 'status':
        fat.cmd_status(sys.argv[2:])
    elif cmd == 'push':
        fat.cmd_push(sys.argv[2:])
    elif cmd == 'pull':
        fat.cmd_pull(sys.argv[2:])
    elif cmd == 'gc':
        fat.cmd_gc()
    elif cmd == 'checkout':
        fat.cmd_checkout(sys.argv[2:])
    elif cmd == 'find':
        fat.cmd_find(sys.argv[2:])
    elif cmd == 'index-filter':
        fat.cmd_index_filter(sys.argv[2:])
    else:
        print('Usage: git fat [init|status|push|pull|gc|checkout|find|index-filter]', file=sys.stderr)
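
# Typical workflow (sketch; assumes this script is installed on PATH as
# `git-fat` so that git dispatches `git fat <cmd>` to it):
#   git fat init      # register the clean/smudge filters in .git/config
#   git fat pull      # fetch referenced fat objects over rsync
#   git fat push      # upload locally stored fat objects over rsync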