Skip to content

Instantly share code, notes, and snippets.

@bgribble
Last active December 12, 2017 18:48
Show Gist options
  • Save bgribble/0931a7ca4f994031813b0e80e57f3706 to your computer and use it in GitHub Desktop.
Save bgribble/0931a7ca4f994031813b0e80e57f3706 to your computer and use it in GitHub Desktop.
Split a source file into chunks, preserving git history
#! /usr/bin/env python3
'''
git-chunks - split a source file into multiple chunks, preserving Git
history of the source text
You have a too-big file foo.py that is old and crufty. In
your editor split it into multiple files foo-bar.py,
foo-baz.py, and foo-quux.py, with some desiccated remains
in foo.py still. So from Git's perspective you have unstaged
changes in foo.py and untracked files foo-bar.py, foo-baz.py,
and foo-quux.py.
If you just "git add" those 3, "git blame" will forget all
about the history of the contents of those files and show you as
the author on today's date.
So:
git-chunks.py foo.py foo-bar.py foo-baz.py foo-quux.py
After that, you will still have unstaged changes in foo.py
and the other 3 files will be committed with intact history.
Inspiration from:
https://beyermatthias.de/blog/2014/09/24/splitting-files-while-preserving-history-in-git/
'''
import argparse
import random
import shlex
import string
from subprocess import getoutput
# Module-wide switches; the command-line entry point overwrites them.
debug = False    # echo each command and its output
dry_run = False  # echo commands instead of executing them


def shelly(cmd, force_run=False):
    """Run ``cmd`` through the shell and return what it printed.

    The module-level ``debug`` and ``dry_run`` flags control the behavior:

    * In dry-run mode the command is only echoed, unless ``force_run`` is
      true (used for read-only queries whose result the caller needs).
    * In debug mode (and not dry-run) the command and each line of its
      output are echoed as well.

    Returns the text captured by ``subprocess.getoutput``, or ``None``
    when the command was skipped because of dry-run mode.
    """
    should_execute = force_run or not dry_run
    verbose = debug and not dry_run

    # The command runs before anything is echoed.
    output = getoutput(cmd) if should_execute else None

    if verbose:
        print("-------------------------------")
    if debug or dry_run:
        print(" [shell] %s" % cmd)
    if verbose:
        for line in output.strip().split('\n'):
            print(" ", line)
    return output
def salty(salt_size):
    """Return a random string of ``salt_size`` uppercase ASCII letters."""
    alphabet = string.ascii_uppercase
    letters = random.choices(alphabet, k=salt_size)
    return ''.join(letters)
def safe_name(name):
    """Return ``name`` with every '/' replaced by '-'.

    Used to turn a file path into a single token for temp-file and
    branch names.
    """
    path_parts = name.split('/')
    return '-'.join(path_parts)
def add_chunk(source_file, chunk_file):
    """Commit ``chunk_file`` so that it inherits the history of ``source_file``.

    ``chunk_file`` is expected to exist as a working file that is not yet
    committed. The steps are:

    1. Move the chunk's content out of the way.
    2. On a temporary branch, ``git mv`` the source file to the chunk's
       name and commit, then put the real chunk content back and commit
       that too.
    3. Back on the original branch, merge the temporary branch without
       fast-forwarding, restore ``source_file`` (which the rename removed)
       from HEAD, and commit the merge.
    4. Delete the temporary branch.

    Args:
        source_file: Path of the tracked file the chunk was cut from.
        chunk_file: Path of the working file holding the chunk.
    """
    print(" Adding chunk file %s" % chunk_file)

    # Everything interpolated into a shell command line is quoted so that
    # paths with spaces, quotes or shell metacharacters are passed through
    # literally instead of being split or interpreted by the shell.
    source = shlex.quote(source_file)
    chunk = shlex.quote(chunk_file)
    stash = shlex.quote(".%s_chunk_tmp" % safe_name(chunk_file))

    # get current branch name -- ask git directly instead of parsing the
    # human-readable `git status` banner, whose wording depends on locale
    # and on whether HEAD is detached
    orig_branch = shelly("git rev-parse --abbrev-ref HEAD", force_run=True)
    chunk_branch = '%s_chunk-%s_%s' % (orig_branch, safe_name(chunk_file), salty(6))
    orig_ref = shlex.quote(orig_branch)
    chunk_ref = shlex.quote(chunk_branch)

    # save chunk file out of the way
    shelly("mv %s %s" % (chunk, stash))

    # checkout a new branch
    shelly("git checkout -b %s" % chunk_ref)

    # rename the source file; this rename is what carries the history over
    shelly("git mv %s %s" % (source, chunk))

    # commit
    create_msg = 'Create %s from %s' % (chunk_file, source_file)
    shelly("git commit -m %s" % shlex.quote(create_msg))

    # copy over the saved chunk file
    shelly("mv %s %s" % (stash, chunk))

    # commit
    shelly("git add %s" % chunk)
    save_msg = 'Save chunk %s' % chunk_file
    shelly("git commit -m %s" % shlex.quote(save_msg))

    # checkout original branch
    shelly("git checkout %s" % orig_ref)

    # merge the chunk branch
    shelly("git merge --no-commit --no-ff %s" % chunk_ref)

    # get back the deleted file; `--` keeps the paths from being read as refs
    shelly("git checkout HEAD -- %s" % source)
    shelly("git checkout --theirs -- %s" % chunk)
    merge_msg = 'Merge chunk %s of %s' % (chunk_file, source_file)
    shelly("git commit -m %s" % shlex.quote(merge_msg))

    # remove the temp branch
    shelly("git branch -d %s" % chunk_ref)
def split_chunks(source_file, chunk_files):
    """Commit every chunk of ``source_file`` with its history preserved.

    The edited working copy of ``source_file`` is set aside, the file is
    reset to its committed state so that it can serve as the basis for
    each chunk's rename, ``add_chunk`` is run for every entry of
    ``chunk_files``, and finally the edited copy is moved back so that it
    shows up as an uncommitted change again.

    Args:
        source_file: Path of the tracked file that was split up.
        chunk_files: Iterable of paths of the working files holding the
            chunks.
    """
    print("Splitting original file %s into chunks" % source_file)

    # Quote paths for the shell, and flatten the stash name with
    # safe_name() the same way add_chunk does: with the raw path, a source
    # such as "src/foo.py" would give ".src/foo.py_chunk_tmp", which lives
    # in a directory that does not exist, so the `mv` would fail and the
    # checkout below would throw the edits away.
    source = shlex.quote(source_file)
    stash = shlex.quote(".%s_chunk_tmp" % safe_name(source_file))

    # save state of original file (should be uncommitted)
    shelly("mv %s %s" % (source, stash))

    # restore source file HEAD as basis for branches; `--` keeps the path
    # from being read as a branch name
    shelly("git checkout -- %s" % source)

    # add all the chunk files
    for chunk in chunk_files:
        add_chunk(source_file, chunk)

    # restore the edited version of the source file (will leave it
    # with uncommitted changes)
    shelly("mv %s %s" % (stash, source))
if __name__ == "__main__":
    # Command-line entry point: parse the options, set the module-level
    # debug / dry_run switches read by shelly(), then split the file.
    cli = argparse.ArgumentParser(description="Split file into multiple chunks")
    cli.add_argument('--debug', action="store_true", help="Print extra debug info")
    cli.add_argument('--dry-run', action="store_true", help="Don't execute commands")
    cli.add_argument('source_file', help='Original file to split')
    cli.add_argument(
        'chunk_files',
        nargs='+',
        help='Paths to untracked working files containing chunks',
    )
    options = vars(cli.parse_args())

    # These are assigned at module level on purpose so shelly() sees them.
    debug = options['debug']
    dry_run = options['dry_run']
    if debug:
        print("Arguments:", options)

    # Report the starting commit; force_run so it is queried even in a dry run.
    start_commit = shelly("git log | head -1 | cut -d ' ' -f 2", force_run=True)
    print("HEAD before starting is commit %s" % start_commit)

    split_chunks(options['source_file'], options['chunk_files'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment