Created
August 15, 2022 18:37
-
-
Save Erotemic/7d6b23109c74941b3810bf85499ed028 to your computer and use it in GitHub Desktop.
DVC Clean POC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def dvc_clean(dvc_root='.', subdir='MySubdirectory'):
    """
    Delete DVC-tracked symlinks under a subdirectory along with their cache files.

    Proof of concept. Walks ``subdir`` for ``*.dvc`` sidecar files, resolves
    every output they list to a (workspace path, cache path) pair, and then
    deletes BOTH the workspace path and the cache file whenever the workspace
    path is a symlink that resolves to that cache file.

    WARNING: this is destructive. Cache objects are removed, so the data is
    only recoverable if it also exists somewhere else (e.g. a DVC remote).

    Args:
        dvc_root (str | PathLike): directory the ``dvc cache dir`` command is
            run from. Defaults to the current working directory.
        subdir (str | PathLike): directory to clean, joined onto ``dvc_root``.
            Defaults to the original placeholder ``'MySubdirectory'``.

    Raises:
        subprocess.CalledProcessError: if ``dvc cache dir`` fails, since the
            command is run with ``check=True``.
    """
    import ubelt as ub
    import yaml

    dvc_root = ub.Path(dvc_root).resolve()
    subdir = dvc_root / subdir

    # Fix: the original evaluated this expression but never bound it, so the
    # helpers below raised NameError on ``dvc_cache_dir``.
    dvc_cache_dir = ub.Path(
        ub.cmd('dvc cache dir', cwd=dvc_root, check=True)['out'].strip())

    def cache_path_for(md5):
        # NOTE(review): assumes the cache is laid out as
        # <cache>/<first 2 hash chars>/<remaining chars>; newer DVC versions
        # may nest this differently -- confirm against the installed version.
        return dvc_cache_dir / md5[0:2] / md5[2:]

    def list_subdir_dvc_fpaths(subdir):
        """Collect the paths of all ``*.dvc`` files found beneath ``subdir``."""
        dvc_fpaths = []
        prog = ub.ProgIter(desc='walking')
        prog.begin()
        for r, ds, fs in subdir.walk():
            prog.step()
            for f in fs:
                if f.endswith('.dvc'):
                    dvc_fpaths.append(r / f)
                    # If "<name>.dvc" sits next to a directory "<name>", prune
                    # that directory from the walk: its contents are resolved
                    # later via the sidecar instead of being traversed here.
                    regular_name = f[:-4]
                    if regular_name in ds:
                        ds.remove(regular_name)
        prog.end()
        return dvc_fpaths

    def find_tracked_files(dvc_fpath):
        """
        Resolve one ``.dvc`` file to a list of dicts holding the workspace
        path (``real_fpath``) and cache path (``cache_fpath``) of each output.
        """
        with open(dvc_fpath, 'r') as file:
            dvc_data = yaml.safe_load(file)
        # Path of the tracked item: the sidecar path with ".dvc" stripped.
        tracked_path = dvc_fpath.augment(ext='')

        new_outs = []
        for out in dvc_data['outs']:
            md5 = out['md5']
            if md5.endswith('.dir'):
                # A ".dir" hash names a cache file that is itself a listing of
                # entries (each read here for 'relpath' and 'md5'); expand it
                # into one output per entry.
                with open(cache_path_for(md5), 'r') as file:
                    dir_outs = yaml.safe_load(file)
                for o in dir_outs:
                    o['path'] = tracked_path / o['relpath']
                new_outs.extend(dir_outs)
            else:
                new_outs.append(out)

        return [
            {
                'real_fpath': dvc_fpath.parent / out['path'],
                'cache_fpath': cache_path_for(out['md5']),
            }
            for out in new_outs
        ]

    dvc_fpaths = list_subdir_dvc_fpaths(subdir)

    all_tracked = []
    for dvc_fpath in ub.ProgIter(dvc_fpaths):
        all_tracked.extend(find_tracked_files(dvc_fpath))

    to_remove = []
    for info in all_tracked:
        real = info['real_fpath']
        cached = info['cache_fpath']
        # Entries where either side is missing are skipped silently.
        if not (real.exists() and cached.exists()):
            continue
        if not real.is_symlink():
            print("Not a symlink: {}".format(real))
            continue
        if real.samefile(cached):
            # Only remove pairs where the link really resolves to the cache.
            to_remove.append(real)
            to_remove.append(cached)
        else:
            print("Real is not the same as cached")

    # Deletion is deferred until after the scan so that every samefile() check
    # above runs against an unmodified cache.
    for fpath in to_remove:
        fpath.delete()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment