Created
October 2, 2017 09:30
-
-
Save alaniwi/7cc6605723383e27d04c134c2e484c75 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Thin out old snapshot directories so that, among snapshots more than
6 months old, only about one per week is kept.
""" | |
import os | |
import re | |
import time | |
import shutil | |
from datetime import date, timedelta | |
def get_list(topdir):
    """
    List the snapshot names found directly under topdir.

    Only entries whose name is exactly eight ASCII digits (yyyymmdd) are
    kept; everything else is ignored.  The names are not checked for being
    valid calendar dates, nor for being directories.

    Returns a list of names ordered newest to oldest (fixed-width yyyymmdd
    strings sort lexicographically in date order).

    Raises OSError if topdir cannot be listed.
    """
    # \Z anchors at the true end of the string.  "$" would also match just
    # before a trailing newline and so let a name ending in "\n" through.
    is_snapshot = re.compile(r"[0-9]{8}\Z").match
    return sorted(filter(is_snapshot, os.listdir(topdir)), reverse=True)
def date_interval(date1, date2):
    """
    Return the number of days by which date1 is later than date2.

    Both arguments are strings in "yyyymmdd" format.  The result is
    negative when date1 is the earlier of the two.
    """
    first, second = [
        date(int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8]))
        for stamp in (date1, date2)
    ]
    return (first - second).days
def date_add_offset(date1, days):
    """
    Return the date that lies `days` days after date1.

    date1 and the result are strings in "yyyymmdd" format; days may be
    negative to step backwards.

    Raises ValueError if date1 does not describe a valid calendar date.
    """
    start = date(int(date1[0:4]), int(date1[4:6]), int(date1[6:8]))
    shifted = start + timedelta(days)
    # Format the shifted date, not the input date, so the offset takes effect.
    return "%04d%02d%02d" % (shifted.year, shifted.month, shifted.day)
def date_today():
    """
    Return the current local date as a "yyyymmdd" string.
    """
    now = time.localtime()
    return time.strftime("%Y%m%d", now)
def get_deletions(dates, thin_after, max_interval):
    """
    Decide which snapshots may be removed.

    dates must be "yyyymmdd" strings sorted newest first.  The list is
    walked from newest to oldest and is not modified.

    A snapshot is selected for deletion when all of these hold:
      - it is more than thin_after days old,
      - it is neither the newest nor the oldest entry, and
      - the gap in days between the nearest newer snapshot that is being
        kept and the next older snapshot is at most max_interval.

    Returns the list of names to delete, newest first.
    """
    today = date_today()
    doomed = []
    if len(dates) < 3:
        # With fewer than three entries there is nothing between the
        # newest and the oldest, so nothing is eligible.
        return doomed

    # Nearest newer snapshot that survives; the newest one always does.
    newer_kept = dates[0]
    for candidate, older in zip(dates[1:], dates[2:]):
        too_recent = date_interval(today, candidate) <= thin_after
        if too_recent or date_interval(newer_kept, older) > max_interval:
            # Kept: it becomes the newer neighbour for what follows.
            newer_kept = candidate
        else:
            doomed.append(candidate)
    return doomed
def do_deletions(topdir, to_delete):
    """
    Recursively remove each entry named in to_delete from beneath topdir.

    to_delete holds names relative to topdir; they are removed in the
    order given.
    """
    doomed_paths = (os.path.join(topdir, name) for name in to_delete)
    for path in doomed_paths:
        shutil.rmtree(path)
def thin_dirs(topdir, thin_after=180, max_interval=7):
    """
    Thin out the dated snapshot directories under topdir.

    Lists the yyyymmdd entries, works out which of those older than
    thin_after days can go while keeping neighbouring snapshots no more
    than max_interval days apart, and removes them from disk.
    """
    snapshots = get_list(topdir)
    do_deletions(topdir, get_deletions(snapshots, thin_after, max_interval))
if __name__ == "__main__":
    # Placeholder path: point this at the real snapshots directory.
    thin_dirs("/path/to/esgf_snapshots/")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Script to remove some of the older directories of hard links on the ESGF mirror sites, assuming that these have been created using
https://gist.github.com/alaniwi/e9186a8b9897c77b1fcc8adf8d25b6dc
Change path in line 90, and run from cron.