Last active
May 26, 2024 18:34
-
-
Save mvdoc/c46e050bda45d3cb5b36ed40c77f2c24 to your computer and use it in GitHub Desktop.
Compute total size of git-annexed files with only one local copy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script computes the total size of git-annex files that have only a single copy, the local one.
# It is useful for estimating how much space would be needed if all of those files were archived.
import subprocess | |
from tqdm import tqdm | |
import json | |
import os | |
def get_files_with_one_copy():
    """List annexed files that match the "single local copy" filter.

    Runs ``git-annex find`` with the matching options ``--copies=1``,
    ``--not --copies=2`` and ``--in=here`` and returns one path per line of
    the command's standard output.

    Returns:
        A list of path strings. The list is empty when nothing matches, and
        also when ``git-annex`` cannot be started or exits with a non-zero
        status; in the error cases a message is printed first.
    """
    command = [
        'git-annex', 'find',
        '--copies=1', '--and', '--not', '--copies=2',
        '--and', '--in=here',
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers a missing or non-executable git-annex binary, which
        # would otherwise escape as an uncaught traceback instead of being
        # reported like any other failure of the command.
        print(f"Error finding files: {e}")
        return []
    return result.stdout.splitlines()
def get_file_size(file):
    """Return the size in bytes of the content that ``file`` points to.

    The path is stat'ed directly, with symlinks followed, so a symlink
    reports the size of its target. This avoids spawning one external
    ``du`` process per file, does not depend on GNU-specific ``du`` flags,
    and cannot misread a path that begins with ``-`` as a command option.

    Intended for regular files (or symlinks to them); a directory reports
    the size of its own inode entry, not the total of its contents.

    Args:
        file: Path of the file to measure.

    Returns:
        The size in bytes, or 0 if the path cannot be stat'ed (for example
        a missing file or a dangling symlink); the error is printed in
        that case.
    """
    try:
        # os.stat follows symlinks by default, mirroring the dereferencing
        # that `du -L` performed.
        return os.stat(file).st_size
    except OSError as e:
        print(f"Error getting info for {file}: {e}")
        return 0
def _format_size(n_bytes):
    """Format a byte count with binary (IEC) units, e.g. ``1.5KiB``.

    ``numfmt --to=iec-i --suffix=B`` is tried first. If it cannot be
    started, exits with an error, or prints nothing, a pure-Python
    formatter is used instead so a size is always reported. The fallback
    rounds to one decimal place and may differ slightly from numfmt's own
    rounding.
    """
    try:
        result = subprocess.run(
            ['numfmt', '--to=iec-i', '--suffix=B', str(n_bytes)],
            capture_output=True, text=True, check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        formatted = ""
    else:
        formatted = result.stdout.strip()
    if formatted:
        return formatted

    units = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB')
    value = float(n_bytes)
    unit_index = 0
    while value >= 1024 and unit_index < len(units) - 1:
        value /= 1024
        unit_index += 1
    if unit_index == 0:
        # Plain byte counts are exact integers; no decimal point needed.
        return f"{n_bytes}B"
    return f"{value:.1f}{units[unit_index]}"


def main():
    """Print the number and total size of files with a single local copy.

    Collects the file list, sums the size of every file while showing a
    progress bar, and prints the total in human-readable form.
    """
    files = get_files_with_one_copy()
    total_size = sum(
        get_file_size(file) for file in tqdm(files, desc="Processing files")
    )
    human_readable_size = _format_size(total_size)
    n_files = len(files)
    print(f"Total size of {n_files} files with only one copy: {human_readable_size}")


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment