Skip to content

Instantly share code, notes, and snippets.

@Erotemic
Created August 15, 2022 18:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Erotemic/7d6b23109c74941b3810bf85499ed028 to your computer and use it in GitHub Desktop.
Save Erotemic/7d6b23109c74941b3810bf85499ed028 to your computer and use it in GitHub Desktop.
DVC Clean POC
def dvc_clean():
    """
    Proof of concept: delete DVC-tracked files under a subdirectory together
    with their entries in the DVC cache.

    The function walks ``./MySubdirectory`` (relative to the current working
    directory, which is treated as the DVC root), collects every ``*.dvc``
    sidecar file, resolves the files each sidecar tracks, and then deletes
    both the workspace file and its cache file whenever the workspace file
    is a symlink that resolves to that cache file.

    WARNING: this permanently removes data from the workspace and from the
    DVC cache.

    NOTE(review): the cache path is built as ``<cache>/<hash[0:2]>/<hash[2:]>``
    from the ``md5`` field of each output -- confirm this matches the cache
    layout of the DVC version in use.
    """
    import ubelt as ub
    import yaml

    dvc_root = ub.Path('.').resolve()
    subdir = dvc_root / 'MySubdirectory'
    # The result must be bound to a name: the helpers below read
    # ``dvc_cache_dir`` as a closure variable.
    dvc_cache_dir = ub.Path(
        ub.cmd('dvc cache dir', cwd=dvc_root, check=True)['out'].strip())

    def list_subdir_dvc_fpaths(subdir):
        """
        Return the paths of all ``*.dvc`` files found while walking ``subdir``.
        """
        dvc_fpaths = []
        prog = ub.ProgIter(desc='walking')
        prog.begin()
        for r, ds, fs in subdir.walk():
            prog.step()
            for f in fs:
                if f.endswith('.dvc'):
                    dvc_fpath = r / f
                    dvc_fpaths.append(dvc_fpath)
                    # "name.dvc" tracks "name"; if that is a sibling
                    # directory, drop it from the list of directories so the
                    # walk does not descend into it (presumably relies on
                    # os.walk-style in-place pruning -- TODO confirm).
                    regular_name = f[:-4]
                    if regular_name in ds:
                        ds.remove(regular_name)
        prog.end()
        return dvc_fpaths

    def find_tracked_files(dvc_fpath):
        """
        Resolve the files tracked by a single ``.dvc`` file.

        Returns:
            list of dict: one item per tracked file with keys ``real_fpath``
            (location in the workspace) and ``cache_fpath`` (location in the
            DVC cache).
        """
        with open(dvc_fpath, 'r') as file:
            dvc_data = yaml.safe_load(file)

        # Path of the tracked file / directory: the sidecar path with its
        # extension replaced by the empty string.
        tracked_path = dvc_fpath.augment(ext='')

        new_outs = []
        outs = dvc_data['outs']
        for out in outs:
            hash_suffix = out['md5']
            if hash_suffix.endswith('.dir'):
                # A ".dir" hash points at a cache file that lists the
                # directory's contents; expand it into one entry per item.
                cache_fpath = (
                    dvc_cache_dir / hash_suffix[0:2] / hash_suffix[2:])
                with open(cache_fpath, 'r') as file:
                    dir_outs = yaml.safe_load(file)
                for o in dir_outs:
                    o['path'] = tracked_path / o['relpath']
                new_outs.extend(dir_outs)
            else:
                new_outs.append(out)

        tracked_files = []
        for out in new_outs:
            hash_suffix = out['md5']
            cache_fpath = (dvc_cache_dir / hash_suffix[0:2] / hash_suffix[2:])
            real_fpath = dvc_fpath.parent / out['path']
            tracked_files.append({
                'real_fpath': real_fpath,
                'cache_fpath': cache_fpath,
            })
        return tracked_files

    dvc_fpaths = list_subdir_dvc_fpaths(subdir)

    all_tracked = []
    for dvc_fpath in ub.ProgIter(dvc_fpaths):
        tracked = find_tracked_files(dvc_fpath)
        all_tracked.extend(tracked)

    # Only schedule a pair for deletion when the workspace file is a symlink
    # that resolves to the expected cache file.
    to_remove = []
    for info in all_tracked:
        real = info['real_fpath']
        cached = info['cache_fpath']
        if not (real.exists() and cached.exists()):
            continue
        if not real.is_symlink():
            print("Not a symlink: {}".format(real))
            continue
        if real.samefile(cached):
            to_remove.append(real)
            to_remove.append(cached)
        else:
            print("Real is not the same as cached")

    for fpath in to_remove:
        fpath.delete()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment