Skip to content

Instantly share code, notes, and snippets.

@alaniwi
Created October 2, 2017 09:30
Show Gist options
  • Save alaniwi/7cc6605723383e27d04c134c2e484c75 to your computer and use it in GitHub Desktop.
Save alaniwi/7cc6605723383e27d04c134c2e484c75 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Thin out old snapshot directories so that, among those more than 6 months
old, only about one per week is kept.
"""
import os
import re
import time
import shutil
from datetime import date, timedelta
def get_list(topdir):
    """
    List the snapshot entries found directly under topdir.

    Only names that look like an 8-digit number (yyyymmdd) are kept;
    every other entry in the directory is ignored.

    Returns the matching names as a list, ordered newest to oldest.
    """
    looks_like_date = re.compile("[0-9]{8}$").match
    snapshots = [name for name in os.listdir(topdir) if looks_like_date(name)]
    # yyyymmdd strings sort chronologically, so a reverse sort is newest-first
    snapshots.sort(reverse=True)
    return snapshots
def date_interval(date1, date2):
    """
    Return the number of days by which date1 is later than date2.

    Both arguments are strings in "yyyymmdd" format.  The result is
    negative when date1 is the earlier of the two.
    """
    def parse(stamp):
        # slice the fixed-width fields: yyyy, mm, dd
        return date(int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8]))

    return (parse(date1) - parse(date2)).days
def date_add_offset(date1, days):
    """
    Return the date that lies the given number of days after date1.

    date1 is a string in "yyyymmdd" format and the result is returned
    in the same format.  days may be negative to step backwards.
    """
    start = date(int(date1[0:4]), int(date1[4:6]), int(date1[6:8]))
    shifted = start + timedelta(days)
    # Format the shifted date; the original code formatted the unshifted
    # input date here, so the offset was silently ignored.
    return "%04d%02d%02d" % (shifted.year, shifted.month, shifted.day)
def date_today():
    """
    Return the current local date as a string in "yyyymmdd" format.
    """
    return date.today().strftime("%Y%m%d")
def get_deletions(dates, thin_after, max_interval):
    """
    Decide which snapshots can be removed, and return them as a list.

    dates must be "yyyymmdd" strings in reverse date order (newest
    first).  The list is walked from newest to oldest and is not
    modified.

    A snapshot is selected for deletion when all of these hold:
      - it is more than thin_after days old,
      - it is neither the newest nor the oldest entry,
      - removing it leaves a gap of at most max_interval days between
        its neighbours, where the newer neighbour is the closest newer
        snapshot that has not itself been selected for deletion.

    The returned list is in the same newest-to-oldest order.
    """
    deletions = []
    if len(dates) < 3:
        # nothing lies strictly between the newest and the oldest entry
        return deletions

    today = date_today()
    newer_kept = dates[0]  # closest newer snapshot that is being retained
    for position in range(1, len(dates) - 1):
        candidate = dates[position]
        next_older = dates[position + 1]
        age = date_interval(today, candidate)
        if (age > thin_after
                and date_interval(newer_kept, next_older) <= max_interval):
            deletions.append(candidate)
        else:
            # candidate survives, so it becomes the newer neighbour
            # for whatever is examined next
            newer_kept = candidate
    return deletions
def do_deletions(topdir, to_delete):
    """
    Recursively remove each of the named entries from under topdir.

    to_delete holds paths relative to topdir; they are removed one at
    a time, in the order given.
    """
    for name in to_delete:
        shutil.rmtree(os.path.join(topdir, name))
def thin_dirs(topdir, thin_after=180, max_interval=7):
    """
    Thin out the dated snapshot directories found under topdir.

    Lists the snapshots, works out which ones can go, and removes
    them.  thin_after is the age in days beyond which a snapshot
    becomes eligible for removal; max_interval is the largest gap in
    days that may be left between the snapshots that remain.
    """
    snapshots = get_list(topdir)
    doomed = get_deletions(snapshots, thin_after, max_interval)
    do_deletions(topdir, doomed)
# Script entry point: thin the snapshot tree using the default settings.
# Edit the path below to point at the real snapshots directory.
if __name__ == '__main__':
    thin_dirs('/path/to/esgf_snapshots/')
@alaniwi
Copy link
Author

alaniwi commented Oct 2, 2017

Script to remove some of the older directories of hard links on the ESGF mirror sites, assuming that these have been created using
https://gist.github.com/alaniwi/e9186a8b9897c77b1fcc8adf8d25b6dc

Change path in line 90, and run from cron.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment