Skip to content

Instantly share code, notes, and snippets.

@mvdoc
Last active May 26, 2024 18:34
Show Gist options
  • Save mvdoc/c46e050bda45d3cb5b36ed40c77f2c24 to your computer and use it in GitHub Desktop.
Save mvdoc/c46e050bda45d3cb5b36ed40c77f2c24 to your computer and use it in GitHub Desktop.
Compute total size of git-annexed files with only one local copy
# This script computes the total size of the git-annex files that have exactly one copy, held in the local repository.
# It's useful for estimating how much storage would be needed if all of those files were archived.
import subprocess
from tqdm import tqdm
import json
import os
def get_files_with_one_copy():
    """Return the annexed files whose only copy is in the local repository.

    Runs ``git-annex find`` with a matcher that accepts ``--copies=1``,
    rejects ``--copies=2`` and requires ``--in=here``.

    Returns:
        list[str]: The file paths reported by git-annex. An empty list is
        returned (after printing the error) when git-annex cannot be
        started or exits with a non-zero status.
    """
    command = [
        'git-annex', 'find',
        '--copies=1', '--and', '--not', '--copies=2',
        '--and', '--in=here',
        # NUL-terminated output keeps names containing newlines (or other
        # characters that str.splitlines() treats as line breaks) intact.
        '--print0',
    ]
    try:
        result = subprocess.run(command, capture_output=True, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers a missing or non-executable git-annex binary.
        print(f"Error finding files: {e}")
        return []
    # Decode with the filesystem encoding so that names which are not valid
    # in the locale encoding still round-trip to later file-system calls.
    return [os.fsdecode(name) for name in result.stdout.split(b'\0') if name]
def get_file_size(file):
    """Return the size of ``file`` in bytes, following symbolic links.

    Args:
        file: Path of the file to measure.

    Returns:
        int: The size in bytes of the file the path resolves to, or 0
        (after printing the error) when the file cannot be stat'ed, e.g.
        because it does not exist or is a dangling symlink.
    """
    try:
        # os.stat() follows symlinks, so a symlinked file reports the size
        # of its target, as `du -bL` did, without spawning a process per
        # file and without misreading names that start with "-" as options.
        return os.stat(file).st_size
    except OSError as e:
        print(f"Error getting info for {file}: {e}")
        return 0
def _format_iec_size(num_bytes):
    """Format a byte count with binary (IEC) prefixes, e.g. ``1.5KiB``.

    Values below 1024 are printed exactly with a ``B`` suffix. Larger
    values are scaled by powers of 1024 and rounded to one decimal place
    when the scaled value is below 10, otherwise to a whole number.
    """
    value = num_bytes
    unit = "B"
    for prefix in ("Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"):
        if value < 1024:
            break
        value /= 1024
        unit = f"{prefix}B"
    if unit == "B":
        return f"{num_bytes}B"
    digits = 1 if value < 10 else 0
    return f"{value:.{digits}f}{unit}"


def main():
    """Print the combined size of the files returned by get_files_with_one_copy().

    Each file is measured with get_file_size() while a tqdm progress bar
    is shown; files that cannot be measured contribute 0 bytes.
    """
    files = get_files_with_one_copy()
    total_size = sum(
        get_file_size(file) for file in tqdm(files, desc="Processing files")
    )
    # Formatting in Python avoids depending on the GNU-only `numfmt` tool,
    # whose absence or failure previously went unchecked.
    human_readable_size = _format_iec_size(total_size)
    n_files = len(files)
    print(f"Total size of {n_files} files with only one copy: {human_readable_size}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment