Skip to content

Instantly share code, notes, and snippets.

@sueszli
Last active April 19, 2024 13:44
Show Gist options
  • Save sueszli/c8bd7ec5d821e281be9cabcf2fa51fef to your computer and use it in GitHub Desktop.
Save sueszli/c8bd7ec5d821e281be9cabcf2fa51fef to your computer and use it in GitHub Desktop.
bypassing github storage service
import hashlib
import sys
import pathlib
import subprocess
"""
github commits are restricted to 25-50 MiB, varying based on the push method [^1].
to handle files beyond this limit, git lfs (large file storage) pointers are necessary, referencing an external lfs server [^2].
however, this method incurs a monthly cloud storage fee to github [^3].
this is a failed attempt at bypassing the file size limit by committing a large file in small chunks:
> remote: warning: File huge-ass-file.tar is 60.00 MB; this is larger than GitHub's recommended maximum file size of 50.00 MB
> remote: error: Trace: 2fa983a46f7b5205ea9bbef6e118069f7426f07618935e67ed6225df9647d617
> remote: error: See https://gh.io/lfs for more information.
> ...
> remote: error: File huge-ass-file.tar is 150.00 MB; this exceeds GitHub's file size limit of 100.00 MB
> remote: error: File huge-ass-file.tar is 200.00 MB; this exceeds GitHub's file size limit of 100.00 MB
[^1]: docs: https://docs.github.com/en/repositories/working-with-files/managing-large-files/about-large-files-on-github#file-size-limits
[^2]: nice comment: wokwokwok, 2021 on hackernews, https://news.ycombinator.com/item?id=27134972#:~:text=of%20such%20projects-,wokwokwok,-on%20May%2013
[^3]: https://docs.github.com/en/billing/managing-billing-for-git-large-file-storage/about-billing-for-git-large-file-storage
"""
def assert_matching_checksums(filepath1: pathlib.Path, filepath2: pathlib.Path) -> None:
    """Raise AssertionError unless both files have identical MD5 content checksums.

    Accepts str or Path (the explicit Path() wrap keeps str callers working).
    """
    print("verifying checksums...")  # was an f-string with no placeholder
    checksum1 = hashlib.md5(pathlib.Path(filepath1).read_bytes()).hexdigest()
    checksum2 = hashlib.md5(pathlib.Path(filepath2).read_bytes()).hexdigest()
    assert checksum1 == checksum2, f"checksums do not match: {checksum1} != {checksum2}"
    print(f"checksums match: {checksum1} == {checksum2}")
def assert_matching_filesizes(filepath1: pathlib.Path, filepath2: pathlib.Path) -> None:
    """Raise AssertionError unless both files have the same size in bytes."""
    print("verifying file sizes...")  # was an f-string with no placeholder
    filesize1 = filepath1.stat().st_size
    filesize2 = filepath2.stat().st_size
    assert filesize1 == filesize2, f"file sizes do not match: {filesize1} != {filesize2}"
    print(f"file sizes match: {filesize1} == {filesize2}")
if __name__ == "__main__":
    # copy the file given as argv[1] into this git repo in 30 MiB increments,
    # committing and pushing after each increment so no single push exceeds
    # github's size limits. (NOTE: the original paste lost its indentation;
    # reconstructed here.)
    file = pathlib.Path(sys.argv[1])

    assert pathlib.Path(".git").exists(), "put this script inside the git directory you want to copy the file to"
    assert file.exists(), f"file does not exist: {file}"
    assert file.is_file(), f"not a file: {file}"
    assert not any(sibling.name == ".git" for sibling in file.parent.glob("*")), f"{file} should not be in a .git directory"

    filesize = file.stat().st_size
    print(f"{file.name} size: {filesize}")
    print("copying and committing chunks to github...")

    chunk_size = 30 * 1024 * 1024  # stay below github's per-push limits
    num_chunks = (filesize // chunk_size) + 1
    dest = pathlib.Path(file.name)  # same basename, inside the repo

    # stream the source once instead of reopening + seeking per iteration,
    # and keep one destination handle instead of truncate-then-append cycles
    with open(file, "rb") as src, open(dest, "wb") as dst:
        for i in range(num_chunks):
            chunk = src.read(chunk_size)
            if not chunk:
                print(f"no more chunks to read at iteration {i}")
                break
            dst.write(chunk)
            dst.flush()  # make the appended bytes visible to git before staging
            # check=True: abort instead of silently continuing when git fails
            subprocess.run(["git", "add", file.name], check=True)
            subprocess.run(["git", "commit", "-m", f"git lfs exploit auto commit: {file.name} - {i}/{num_chunks}"], check=True)
            subprocess.run(["git", "push"], check=True)
            print(f"\033[92mprogress: {i}/{num_chunks} \033[0m")

    assert_matching_checksums(file, dest)
    assert_matching_filesizes(file, dest)
    print(f"finished! {file.name} pushed to github")
@sueszli
Copy link
Author

sueszli commented Apr 19, 2024

here's what i meant:

import hashlib
import sys
import shutil
import pathlib
import subprocess


def split():
    """Split the file given as sys.argv[1] into 50 MiB chunks and push each chunk to github in its own commit.

    Chunks land in ./data as "<name>.<i>"; an "<name>.md5" checksum file is pushed
    for merge() to verify against. Must be run from the repository root.
    """
    filepath = pathlib.Path(sys.argv[1])

    assert pathlib.Path(".git").exists(), "put this script inside the git directory"
    assert filepath.exists(), f"file does not exist: {filepath}"
    assert filepath.is_file(), f"not a file: {filepath}"
    assert not any(sibling.name == ".git" for sibling in filepath.parent.glob("*")), f"{filepath} should not be in a .git directory"

    # create tmp directory (scratch space, never committed)
    tmp_dir_path = pathlib.Path("tmp")
    shutil.rmtree(tmp_dir_path, ignore_errors=True)
    tmp_dir_path.mkdir()
    print(f"created new directory: {tmp_dir_path}")

    # don't track tmp directory
    gitignore_path = pathlib.Path(".gitignore")
    gitignore_path.touch()
    already_ignored = any(line.strip() == f"{tmp_dir_path}/" for line in gitignore_path.read_text().split("\n"))
    if not already_ignored:
        with open(gitignore_path, "a") as f:
            f.write(f"{tmp_dir_path}/\n")
        print(f"added {tmp_dir_path} to .gitignore")
    # no check=True on commit: committing an unchanged .gitignore exits non-zero
    subprocess.run(["git", "add", ".gitignore"])
    subprocess.run(["git", "commit", "-m", "lfs-hack auto commit: .gitignore"])
    subprocess.run(["git", "push"])

    # copy file to tmp directory
    shutil.copy(filepath, tmp_dir_path)
    print(f"copied {filepath.name} to {tmp_dir_path} directory")

    # create checksum file (merge() verifies the reassembled file against this)
    with open(tmp_dir_path / f"{filepath.name}.md5", "w") as f:
        f.write(hashlib.md5(pathlib.Path(tmp_dir_path / filepath.name).read_bytes()).hexdigest())
    print("created checksum file: *.md5")

    # split file into chunks in tmp directory.
    # stream one pass over the file: the old version called read_bytes() on the
    # whole file once PER chunk, i.e. O(n^2) bytes read and the full file in RAM.
    chunk_size = 50 * 1024 * 1024
    num_chunks = (filepath.stat().st_size // chunk_size) + 1
    with open(tmp_dir_path / filepath.name, "rb") as src:
        for i in range(num_chunks):
            (tmp_dir_path / f"{filepath.name}.{i}").write_bytes(src.read(chunk_size))
    (tmp_dir_path / filepath.name).unlink()
    print("finished creating chunks")

    # create data file
    data_dir_path = pathlib.Path("data")
    shutil.rmtree(data_dir_path, ignore_errors=True)
    data_dir_path.mkdir()
    print(f"created new directory: {data_dir_path}")

    # copy each chunk into data directory, commit, and push
    for i in range(num_chunks):
        chunkfile_path = tmp_dir_path / f"{filepath.name}.{i}"
        shutil.copy(chunkfile_path, data_dir_path)
        subprocess.run(["git", "add", data_dir_path / chunkfile_path.name])
        subprocess.run(["git", "commit", "-m", f"lfs-hack auto commit: {chunkfile_path}"])
        subprocess.run(["git", "push"])
        print(f"\033[92mprogress: {i}/{num_chunks} \033[0m")
    print("finished pushing chunks")

    # upload checksum file
    shutil.copy(tmp_dir_path / f"{filepath.name}.md5", data_dir_path)
    subprocess.run(["git", "add", data_dir_path / f"{filepath.name}.md5"])
    subprocess.run(["git", "commit", "-m", f"lfs-hack auto commit: {filepath.name}.md5"])
    subprocess.run(["git", "push"])
    print("finished pushing checksum file")


def merge():
    """Reassemble the chunk files in ./data into the original file and verify its checksum.

    Expects ./data to contain "<name>.<i>" chunk files plus "<name>.md5" with the
    hex MD5 of the whole file. Leaves the merged file in ./data and deletes the
    chunks and the checksum file afterwards.
    """
    data_dir_path = pathlib.Path("data")

    assert data_dir_path.exists(), f"directory does not exist: {data_dir_path}"
    for file in data_dir_path.iterdir():
        suffix = file.name.split(".")[-1]
        assert suffix.isdigit() or suffix == "md5", f"unexpected file: {file}"

    # merge all chunks together
    data_file_paths = sorted(data_dir_path.glob("*"))
    chunk_files = [file for file in data_file_paths if file.suffix != ".md5"]
    # BUG FIX: lexicographic glob order puts ".10" before ".2", which corrupted
    # the merged file whenever there were more than 10 chunks (this was the
    # "checksum doesn't match" failure) -- sort by the integer chunk index.
    chunk_files.sort(key=lambda path: int(path.suffix[1:]))
    dst_path = data_dir_path / data_file_paths[0].stem
    with open(dst_path, "wb") as f:
        for chunk_file in chunk_files:
            f.write(chunk_file.read_bytes())
    print(f"finished merging {len(chunk_files)} chunks into {dst_path}")

    # verify checksum
    given_checksum = hashlib.md5(dst_path.read_bytes()).hexdigest()
    expected_checksum = (data_dir_path / f"{dst_path.name}.md5").read_text().strip()
    assert given_checksum == expected_checksum, f"checksum mismatch: {given_checksum} != {expected_checksum}"
    print(f"checksum verified: {given_checksum} == {expected_checksum}")

    # remove chunks (the glob above predates dst_path, so the merged file survives)
    for file in data_file_paths:
        file.unlink()
    print("removed all chunks")


if __name__ == "__main__":
    # usage:
    #   step 1) run split() once -- splits a large file into chunks and pushes
    #           them to github to bypass the 100mb limit.
    #   step 2) run merge() on every clone -- reassembles the chunks back into
    #           the original file.
    split()
    # merge()

@sueszli
Copy link
Author

sueszli commented Apr 19, 2024

with the version above the checksum doesn't match and the file is corrupted — most likely because the chunks are globbed and sorted lexicographically, so chunk ".10" gets merged before chunk ".2" once there are more than ten chunks

here's a much nicer version:

split:

file_path=$1
# quote every expansion: unquoted variables word-split on paths with spaces
# and make the [ -z ... ] tests misbehave on empty values
if [ -z "$file_path" ]; then echo "file path not given"; exit 1; fi
if [ ! -f "$file_path" ]; then echo "file not found"; exit 1; fi
if [ ! -s "$file_path" ]; then echo "file is empty"; exit 1; fi
echo "file found: $file_path"

# compute the basename once instead of re-running basename per use
base_name=$(basename "$file_path")

# validate .gitignore
if [ ! -f .gitignore ]; then echo ".gitignore not found"; exit 1; fi
if ! grep -q "tmp/" .gitignore; then echo "tmp/ not in .gitignore"; exit 1; fi
if ! grep -q "data-merged/" .gitignore; then echo "data-merged/ not in .gitignore"; exit 1; fi

# create tmp directory
rm -rf tmp
mkdir tmp
echo "created tmp directory"

# copy file to tmp directory
cp "$file_path" tmp
echo "copied $file_path to tmp directory"

# create checksum file
checksum=$(md5sum "tmp/$base_name" | awk '{ print $1 }')
echo "$checksum" > "tmp/$base_name.md5"
echo "created checksum file: $base_name.md5"

# split file into chunks in tmp directory
chunk_size=$((50 * 1024 * 1024))
split -b "$chunk_size" "tmp/$base_name" "tmp/$base_name-chunk-"

# create data directory
rm -rf data
mkdir data
echo "created data directory"

# copy checksum
mv "tmp/$base_name.md5" data

# iterate over chunks, push to git (glob stays outside the quotes so it expands)
num_chunks=$(ls "tmp/$base_name"-chunk-* | wc -l)
counter=0
for chunk in "tmp/$base_name"-chunk-*; do
    counter=$((counter + 1))
    progress_str=$(printf "%d/%d" "$counter" "$num_chunks")

    mv "$chunk" data
    git add .
    git commit -m "auto commit: $(basename "$chunk") $progress_str"
    git push

    echo "🟢 pushed $(basename "$chunk") $progress_str"
done

echo "🟢 done"
exit 0

merge:

# validate .gitignore
if [ ! -f .gitignore ]; then echo ".gitignore not found"; exit 1; fi
if ! grep -q "tmp/" .gitignore; then echo "tmp/ not in .gitignore"; exit 1; fi
if ! grep -q "data-merged/" .gitignore; then echo "data-merged/ not in .gitignore"; exit 1; fi

# validate ./data/* files
if [ ! -d data ]; then echo "data/ directory not found"; exit 1; fi
# BUG FIX: was &&, which only failed when BOTH the chunks and the checksum file
# were missing; we need both present, so fail if EITHER is missing
if ! ls data/*-chunk-* &> /dev/null || ! ls data/*.md5 &> /dev/null; then echo "chunk or checksum files missing in data/"; exit 1; fi

# create data-merged directory
rm -rf data-merged
mkdir data-merged
echo "created data-merged directory"

# merge chunks into data-merged directory (glob expands in sorted order, which
# matches split's fixed-width -chunk-aa, -chunk-ab, ... suffixes)
cat data/*-chunk-* > data-merged/merged.tar
echo "merged chunks into data-merged/merged.tar"

# validate checksum
expected_checksum=$(cat data/*.md5)
actual_checksum=$(md5sum data-merged/merged.tar | awk '{ print $1 }')
# quote the operands: the unquoted test errors out if a checksum is empty
if [ "$expected_checksum" != "$actual_checksum" ]; then echo "checksum mismatch"; exit 1; fi
echo "checksum matched: $expected_checksum == $actual_checksum"

echo "🟢 done"
exit 0

@sueszli
Copy link
Author

sueszli commented Apr 19, 2024

update: the script above worked just fine and it was surprisingly easy to set up

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment