#!/usr/bin/env python
# -*- mode: python -*-
from __future__ import print_function, with_statement
import sys
import hashlib
import tempfile
import os
import subprocess
import shlex
import shutil
import itertools
import threading
import time
import collections
import posixpath
BLOCK_SIZE = 4096

def pathjoin(path, *paths):
    return posixpath.join(path, *paths)

def make_string(s):
    if isinstance(s, bytes):
        return s.decode("utf-8")
    else:
        return s

def make_buffer(s):
    if isinstance(s, bytes):
        return s
    else:
        return s.encode("utf-8")

def verbose_stderr(*args, **kwargs):
    return print(*map(make_string, args), file=sys.stderr, **kwargs)

def verbose_ignore(*args, **kwargs):
    pass

def ensure_binary_mode(stream):
    try:  # Attempt the Python-3 way, also needed to handle unicode
        return stream.detach()
    except Exception:  # Python-2 file objects have no detach()
        pass
    if sys.platform == "win32":
        # Fall back to Python-2 way, only needed on Windows
        import msvcrt
        msvcrt.setmode(stream.fileno(), os.O_BINARY)
    return stream

def mkdir_p(path):
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def readblocks(stream):
    bytecount = 0
    while True:
        data = stream.read(BLOCK_SIZE)
        bytecount += len(data)
        if not data:
            break
        yield data

def cat_iter(initer, outstream):
    for block in initer:
        outstream.write(block)

def cat(instream, outstream):
    return cat_iter(readblocks(instream), outstream)

def difftreez_reader(input):
    """Incremental reader for git diff-tree -z output

    :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ...
    """
    buffer = []
    partial = b''
    while True:
        newread = input.read(BLOCK_SIZE)
        if not newread:
            break
        partial += newread
        while True:
            head, sep, partial = partial.partition(b'\0')
            if not sep:
                partial = head
                break
            buffer.append(head)
            if len(buffer) == 2:
                oldmode, newmode, oldhash, newhash, modflag = buffer[0].split()
                path = buffer[1]
                yield (newhash, modflag, path)
                buffer = []
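
# Example: for a diff-tree record like
#   b':100644 100644 <oldsha1> <newsha1> M\0some/path\0'
# difftreez_reader yields the tuple (b'<newsha1>', b'M', b'some/path').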

def gitconfig_get(name, file=None):
    args = ['git', 'config', '--get']
    if file is not None:
        args += ['--file', make_string(file)]
    args.append(make_string(name))
    p = subprocess.Popen(args, stdout=subprocess.PIPE)
    output = p.communicate()[0].strip()
    if p.returncode != 0:
        return None
    else:
        return output

def gitconfig_set(name, value, file=None):
    args = ['git', 'config']
    if file is not None:
        args += ['--file', file]
    args += [name, value]
    subprocess.check_call(args)

class GitFat(object):
    DecodeError = RuntimeError

    def __init__(self):
        self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
        self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
        self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip()
        self.objdir = pathjoin(self.gitdir, b'fat', b'objects')
        if os.environ.get('GIT_FAT_VERSION') == '1':
            self.encode = self.encode_v1
        else:
            self.encode = self.encode_v2
        def magiclen(enc):
            return len(enc(hashlib.sha1(b'dummy').hexdigest().encode('ASCII'), 5))
        self.magiclen = magiclen(self.encode)  # Current version
        self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]]  # All prior versions

    def setup(self):
        mkdir_p(self.objdir)

    def get_rsync(self):
        cfgpath = pathjoin(self.gitroot, b'.gitfat')
        remote = gitconfig_get('rsync.remote', file=cfgpath)
        ssh_port = gitconfig_get('rsync.sshport', file=cfgpath)
        ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath)
        if remote is None:
            raise RuntimeError('No rsync.remote in %s' % cfgpath)
        return remote, ssh_port, ssh_user
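
    # A minimal .gitfat file at the repository root might look like this
    # (illustrative values; only rsync.remote is required, the ssh keys
    # are optional and match the lookups above):
    #   [rsync]
    #       remote = storage.example.com:fat-store
    #       sshuser = git
    #       sshport = 2222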

    def get_rsync_command(self, push):
        (remote, ssh_port, ssh_user) = self.get_rsync()
        if push:
            self.verbose('Pushing to:', remote)
        else:
            self.verbose('Pulling from:', remote)
        cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-']
        rshopts = ''
        if ssh_user:
            rshopts += ' -l ' + make_string(ssh_user)
        if ssh_port:
            rshopts += ' -P ' + make_string(ssh_port)  # plink takes -P for the port, unlike ssh's -p
        if rshopts:
            cmd.append('--rsh=plink' + rshopts)
        if push:
            cmd += [make_string(self.objdir) + '/', make_string(remote) + '/']
        else:
            cmd += [make_string(remote) + '/', make_string(self.objdir) + '/']
        return cmd

    def revparse(self, revname):
        return subprocess.check_output(['git', 'rev-parse', revname]).strip()

    def encode_v1(self, digest, bytecount):
        'Produce legacy representation of file to be stored in repository.'
        return (b'#$# git-fat ' + digest + b'\n')

    def encode_v2(self, digest, bytecount):
        'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.'
        return (b'#$# git-fat ' + digest + (' %20d\n' % (bytecount,)).encode('ASCII'))

    def decode(self, bstring, noraise=False):
        cookie = b'#$# git-fat '
        if bstring.startswith(cookie):
            parts = bstring[len(cookie):].split()
            digest = parts[0]
            # bytecount is an int, or None for v1 placeholders without a size
            bytecount = int(parts[1]) if len(parts) > 1 else None
            return digest, bytecount
        elif noraise:
            return None, None
        else:
            raise GitFat.DecodeError('Could not decode %s' % repr(bstring))
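
    # A v2 placeholder as stored in the repository is one 74-byte line:
    #   '#$# git-fat <40-hex sha1 of content> <byte count padded to 20 chars>\n'
    # The legacy v1 form is the same line without the byte count field.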

    def decode_stream(self, stream):
        'Return digest if git-fat cache, otherwise return iterator over entire file contents'
        preamble = stream.read(self.magiclen)
        try:
            return self.decode(preamble)
        except GitFat.DecodeError:
            # Not sure if this is the right behavior
            return itertools.chain([preamble], readblocks(stream)), None

    def decode_file(self, fname):
        # Fast check
        stat = os.stat(fname)
        if stat.st_size != self.magiclen:
            return False, None
        # read file
        digest, bytecount = self.decode_stream(open(fname, 'rb'))
        if isinstance(digest, bytes):
            return digest, bytecount
        else:
            return None, bytecount

    def decode_clean(self, body):
        '''
        Attempt to decode version in working tree. The tree version could be changed to have a more
        useful message than the machine-readable copy that goes into the repository. If the tree
        version decodes successfully, it indicates that the fat data is not currently available in
        this repository.
        '''
        digest, bytecount = self.decode(body, noraise=True)
        return digest

    def filter_clean(self, instream, outstreamclean):
        h = hashlib.new('sha1')
        bytecount = 0
        # mkstemp requires 'str' rather than native filesystem bytes
        fd, tmpname = tempfile.mkstemp(dir=self.objdir.decode(sys.getfilesystemencoding()))
        try:
            ishanging = False
            cached = False  # changes to True when file is cached
            with os.fdopen(fd, 'wb') as cache:
                outstream = cache
                firstblock = True
                for block in readblocks(instream):
                    if firstblock:
                        if len(block) == self.magiclen and self.decode_clean(block[0:self.magiclen]):
                            ishanging = True  # Working tree version is verbatim from repository (not smudged)
                            outstream = outstreamclean
                        firstblock = False
                    h.update(block)
                    bytecount += len(block)
                    outstream.write(block)
                outstream.flush()
            digest = h.hexdigest().encode('ASCII')
            objfile = pathjoin(self.objdir, digest)
            if not ishanging:
                if os.path.exists(objfile):
                    self.verbose('git-fat filter-clean: cache already exists:', objfile)
                    os.remove(tmpname)
                else:
                    os.rename(make_string(tmpname), make_string(objfile))
                    self.verbose('git-fat filter-clean: caching to:', objfile)
                cached = True
                outstreamclean.write(self.encode(digest, bytecount))
        finally:
            if not cached:
                os.remove(tmpname)

    def cmd_filter_clean(self):
        '''
        The clean filter runs when a file is added to the index. It gets the "smudged" (tree)
        version of the file on stdin and produces the "clean" (repository) version on stdout.
        '''
        self.setup()
        # Set stdin and stdout to binary mode
        sys.stdin = ensure_binary_mode(sys.stdin)
        sys.stdout = ensure_binary_mode(sys.stdout)
        self.filter_clean(sys.stdin, sys.stdout)

    def cmd_filter_smudge(self):
        self.setup()
        # Ensure streams are treated as binary
        sys.stdin = ensure_binary_mode(sys.stdin)
        sys.stdout = ensure_binary_mode(sys.stdout)
        result, bytecount = self.decode_stream(sys.stdin)
        if isinstance(result, bytes):  # We got a digest
            objfile = pathjoin(self.objdir, result)
            try:
                self.verbose('git-fat filter-smudge: restoring from:', objfile)
                cat(open(objfile, 'rb'), sys.stdout)
            except IOError:  # file not found
                self.verbose('git-fat filter-smudge: fat object missing:', objfile)
                sys.stdout.write(self.encode(result, bytecount))  # could leave a better notice about how to recover this file
        else:  # We have an iterable over the original input.
            self.verbose('git-fat filter-smudge: not a managed file')
            cat_iter(result, sys.stdout)

    def catalog_objects(self):
        return set(os.listdir(self.objdir))

    def referenced_objects(self, rev=None, all=False):
        referenced = set()
        if all:
            rev = '--all'
        elif rev is None:
            rev = self.revparse('HEAD')
        rev = make_string(rev)
        p1 = subprocess.Popen(['git', 'rev-list', '--objects', rev], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        def cut_sha1hash(input, output):
            for line in input:
                output.write(line.split()[0] + b'\n')
            output.close()
        cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
        cut_thread.start()
        for line in p2.stdout:
            objhash, objtype, size = line.split()
            if objtype == b'blob' and int(size) in self.magiclens:
                try:
                    objhash = make_string(objhash)
                    fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0]
                    referenced.add(fathash)
                except GitFat.DecodeError:
                    pass
        cut_thread.join()
        p1.wait()
        p2.wait()
        return referenced

    def orphan_files(self):
        'generator for all orphan placeholders in the working tree'
        for fname in subprocess.check_output(['git', 'ls-files']).splitlines():
            digest = self.decode_file(fname)[0]
            if digest:
                yield (digest, fname)

    def cmd_status(self, args):
        self.setup()
        catalog = self.catalog_objects()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        referenced = self.referenced_objects(**refargs)
        garbage = catalog - referenced
        orphans = referenced - catalog
        if '--all' in args:
            for obj in referenced:
                print(make_string(obj))
        if orphans:
            print('Orphan objects:')
            for orph in orphans:
                print(' ' + make_string(orph))
        if garbage:
            print('Garbage objects:')
            for g in garbage:
                print(' ' + make_string(g))

    def is_dirty(self):
        # diff-index --quiet exits 0 when the working tree matches HEAD
        return subprocess.call(['git', 'diff-index', '--quiet', 'HEAD']) != 0

    def cmd_push(self, args):
        'Push anything that I have stored and referenced'
        self.setup()
        # Default to push only those objects referenced by current HEAD
        # (includes history). Finer-grained pushing would be useful.
        pushall = '--all' in args
        files = self.referenced_objects(all=pushall) & self.catalog_objects()
        cmd = self.get_rsync_command(push=True)
        self.verbose('Executing:', ' '.join(cmd))
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input=b'\x00'.join(files))

    def checkout(self, show_orphans=False):
        'Update any stale files in the present working tree'
        for digest, fname in self.orphan_files():
            objpath = pathjoin(self.objdir, digest)
            if os.access(objpath, os.R_OK):
                print((b'Restoring ' + digest + b' -> ' + fname).decode(sys.getfilesystemencoding()))
                # The output of our smudge filter depends on the existence of
                # the file in .git/fat/objects, but git caches the file stat
                # from the previous time the file was smudged, therefore it
                # won't try to re-smudge. I don't know a git command that
                # specifically invalidates that cache, but touching the file
                # also does the trick.
                os.utime(fname, None)
                # This re-smudge is essentially a copy that restores permissions.
                subprocess.check_call(['git', 'checkout-index', '--index', '--force', fname])
            elif show_orphans:
                print('Data unavailable: %s %s' % (digest, fname))

    def cmd_pull(self, args):
        'Pull anything that I have referenced, but not stored'
        self.setup()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        for arg in args:
            if arg.startswith('-') or len(arg) != 40:
                continue
            rev = self.revparse(arg)
            if rev:
                refargs['rev'] = rev
        files = self.referenced_objects(**refargs) - self.catalog_objects()
        cmd = self.get_rsync_command(push=False)
        self.verbose('Executing:', ' '.join(cmd))
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input=b'\x00'.join(files))
        self.checkout()

    def cmd_checkout(self, args):
        self.checkout(show_orphans=True)

    def cmd_gc(self):
        garbage = self.catalog_objects() - self.referenced_objects()
        print('Unreferenced objects to remove: %d' % len(garbage))
        for obj in garbage:
            fname = pathjoin(self.objdir, obj)
            print('%10d %s' % (os.stat(fname).st_size, make_string(obj)))
            os.remove(fname)

    def cmd_init(self):
        self.setup()
        if gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge'):
            print('Git fat already configured, check configuration in .git/config')
        else:
            gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
            gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge')
            print('Initialized git fat')
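
    # Note: `git fat init` only registers the filters; files are actually
    # managed once .gitattributes routes them through the filter, e.g.
    # (illustrative pattern):
    #   *.psd filter=fat -text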

    def gen_large_blobs(self, revs, threshsize):
        """Generate (hash, size) pairs for every blob larger than threshsize bytes"""
        time0 = time.time()
        def hash_only(input, output):
            """The output of git rev-list --objects shows extra info for blobs, subdirectory trees, and tags.
            This truncates to one hash per line.
            """
            for line in input:
                output.write(line[:40] + b'\n')
            output.close()
        revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1)
        objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
        hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin))
        hashonly.start()
        numblobs = 0
        numlarge = 0
        for line in objcheck.stdout:
            objhash, blob, size = line.split()
            if blob != b'blob':
                continue
            size = int(size)
            numblobs += 1
            if size > threshsize:
                numlarge += 1
                yield objhash, size
        revlist.wait()
        objcheck.wait()
        hashonly.join()
        time1 = time.time()
        self.verbose('%d of %d blobs are > %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1 - time0))

    def cmd_find(self, args):
        # FIXME: Need input validation here
        threshsize = int(args[0])
        blobsizes = dict(self.gen_large_blobs('--all', threshsize))
        time0 = time.time()
        # Find all names assumed by large blobs (those in blobsizes)
        pathsizes = collections.defaultdict(lambda: set())
        revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1)
        difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'],
                                    stdin=revlist.stdout, stdout=subprocess.PIPE)
        for newblob, modflag, path in difftreez_reader(difftree.stdout):
            bsize = blobsizes.get(newblob)
            path = path.decode(sys.getfilesystemencoding())
            if bsize:  # We care about this blob
                pathsizes[path].add(bsize)
        time1 = time.time()
        self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1 - time0))
        maxlen = max(map(len, pathsizes)) if pathsizes else 0
        for path, sizes in sorted(pathsizes.items(), key=lambda item: max(item[1]), reverse=True):
            print('%-*s filter=fat -text # %10d %d' % (maxlen, path, max(sizes), len(sizes)))
        revlist.wait()
        difftree.wait()
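
    # Example: list every path that has ever held a blob larger than 5 MB,
    # formatted as candidate .gitattributes lines (the trailing comment shows
    # the largest blob size and the number of distinct sizes seen):
    #   git fat find 5242880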

    def cmd_index_filter(self, args):
        # FIXME: Need input validation here
        manage_gitattributes = '--manage-gitattributes' in args
        filelist = set(f.strip() for f in open(args[0], 'rb').readlines())
        lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
        updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
        for line in lsfiles.stdout:
            mode, sep, tail = line.partition(b' ')
            blobhash, sep, tail = tail.partition(b' ')
            stageno, sep, tail = tail.partition(b'\t')
            filename = tail.strip()
            if filename not in filelist:
                continue
            # This file will contain the hash of the cleaned object
            hashfile = pathjoin(self.gitdir, b'fat', b'index-filter', blobhash)
            try:
                cleanedobj = open(hashfile, 'rb').read().rstrip()
            except IOError:
                catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE)
                hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
                def dofilter():
                    self.filter_clean(catfile.stdout, hashobject.stdin)
                    hashobject.stdin.close()
                filterclean = threading.Thread(target=dofilter)
                filterclean.start()
                cleanedobj = hashobject.stdout.read().rstrip()
                catfile.wait()
                hashobject.wait()
                filterclean.join()
                mkdir_p(os.path.dirname(hashfile))
                open(hashfile, 'wb').write(cleanedobj + b'\n')
            updateindex.stdin.write(mode + b' ' + cleanedobj + b' ' + stageno + b'\t' + filename + b'\n')
        if manage_gitattributes:
            try:
                mode, blobsha1, stageno, filename = subprocess.check_output(['git', 'ls-files', '-s', '.gitattributes']).split()
                gitattributes_lines = subprocess.check_output(['git', 'cat-file', 'blob', blobsha1]).splitlines()
            except ValueError:  # Nothing to unpack, thus no file
                mode, stageno = b'100644', b'0'
                gitattributes_lines = []
            gitattributes_extra = [line.split()[0] + b' filter=fat -text' for line in filelist]
            hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            stdout, stderr = hashobject.communicate(b'\n'.join(gitattributes_lines + gitattributes_extra) + b'\n')
            updateindex.stdin.write(mode + b' ' + stdout.strip() + b' ' + stageno + b'\t.gitattributes\n')
        updateindex.stdin.close()
        lsfiles.wait()
        updateindex.wait()
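
    # Sketch of intended use with git filter-branch, rewriting history so
    # the listed paths become fat placeholders (fat-files is a hypothetical
    # file naming one path per line to convert):
    #   git filter-branch --index-filter \
    #       'git-fat index-filter /abs/path/to/fat-files --manage-gitattributes' \
    #       --tag-name-filter cat -- --all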

if __name__ == '__main__':
    fat = GitFat()
    cmd = sys.argv[1] if len(sys.argv) > 1 else ''
    if cmd == 'filter-clean':
        fat.cmd_filter_clean()
    elif cmd == 'filter-smudge':
        fat.cmd_filter_smudge()
    elif cmd == 'init':
        fat.cmd_init()
    elif cmd == 'status':
        fat.cmd_status(sys.argv[2:])
    elif cmd == 'push':
        fat.cmd_push(sys.argv[2:])
    elif cmd == 'pull':
        fat.cmd_pull(sys.argv[2:])
    elif cmd == 'gc':
        fat.cmd_gc()
    elif cmd == 'checkout':
        fat.cmd_checkout(sys.argv[2:])
    elif cmd == 'find':
        fat.cmd_find(sys.argv[2:])
    elif cmd == 'index-filter':
        fat.cmd_index_filter(sys.argv[2:])
    else:
        print('Usage: git fat [init|status|push|pull|gc|checkout|find|index-filter]', file=sys.stderr)
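
# Typical workflow (sketch; assumes this script is installed on PATH as
# `git-fat` so that git dispatches `git fat <cmd>` to it):
#   git fat init      # register the clean/smudge filters in .git/config
#   git fat pull      # fetch referenced fat objects over rsync
#   git fat push      # upload locally stored fat objects over rsync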