@esolitos
Last active November 12, 2018 10:35
UNMAINTAINED - See `ramsalt/cron-scripts`
import os, sys, subprocess, re
from datetime import datetime, timedelta

if __name__ == '__main__':
    now_date = datetime.today()
    date_7days_ago = now_date - timedelta(days=7)
    date_4weeks_ago = now_date - timedelta(weeks=4)

    bucket_name = 'INSERT_AWS_S3_BUCKET_NAME_HERE'

    db_bkps = {}
    file_bkps = {}
    bkps_in_excess = []
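    # Retention policy (inferred from the logic below, not stated by the
    # original author): backups newer than 7 days are never touched; "files"
    # backups older than 4 weeks are pruned down to the most recent one per
    # site; "db" backups are pruned down to the newest backup of each ISO week.
    # Index structures: file_bkps[site] holds the currently selected files
    # backup, db_bkps[site]['Y<iso-year>']['W<iso-week>'] holds the selected
    # weekly db backup, and bkps_in_excess collects everything to delete.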
print "\n##### 1. Fetch list of stored backups from S3...\n"
proc = subprocess.Popen(['/usr/local/bin/s3cmd', 'ls', 's3://'+bucket_name+'/'], stdout=subprocess.PIPE)
(out, err) = proc.communicate()
# print out
# print err
## Debug!
# f = open('out.txt', 'w')
# f.write(out)
# f.close()
print "\n##### 2. Match and index all items...\n"
for line in out.split('\n'):
# for line in open('out.txt', 'r').read().split('\n'):
backup = line.split()
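        # An `s3cmd ls` line looks roughly like
        #   2018-11-05 03:12   1234567   s3://<bucket>/<object key>
        # so after split() the object URI sits at index 3. A hypothetical key
        # name in the expected format (not taken from a real bucket):
        #   s3://my-bucket/mysite_prod_2018-11-05T03-00-00_ab12cd34_db.sql.gz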
        try:
            re_s3_prefix = '(?:' + re.escape('s3://'+bucket_name+'/') + ')'
            re_grp_site = '(?P<site>[a-z0-9\-]+_[a-z]+)'
            re_grp_datetime = '(?P<date>(?:[\d\-]+){3}T(?:[\d\-]+){3})'
            re_grp_hash = '(?P<hash>[a-z0-9]+)'
            re_grp_bktype = '(?P<bktype>files|db)'
            b_info_pattern = '^'+re_s3_prefix+re_grp_site+'_'+re_grp_datetime+'_'+re_grp_hash+'_'+re_grp_bktype+'.*(?:\.(tar|sql)(\.gz)?)$'

            match = re.match(b_info_pattern, backup[3], flags=re.IGNORECASE)
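            # For the hypothetical key above, groupdict() would be something like
            # {'site': 'mysite_prod', 'date': '2018-11-05T03-00-00',
            #  'hash': 'ab12cd34', 'bktype': 'db'}.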
            if match is not None:
                # print match.groupdict()
                info = match.groupdict()
                bkp_type = info['bktype']
                bkp_site = info['site']
                bkp_date = datetime.strptime(info['date'], "%Y-%m-%dT%H-%M-%S")
                date_cal = bkp_date.isocalendar()
                year_sel = 'Y' + str(date_cal[0])
                week_sel = 'W' + str(date_cal[1])
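                # isocalendar() returns (ISO year, ISO week, ISO weekday), e.g.
                # datetime(2018, 11, 5).isocalendar() == (2018, 45, 1),
                # giving year_sel == 'Y2018' and week_sel == 'W45'.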
                # Leave backups from the last 7 days untouched.
                if bkp_date > date_7days_ago:
                    # print "Keep ", bkp_type, " from ", bkp_date, " for: ", bkp_site
                    continue
                # Simple logic for FILE backups
                if bkp_type == 'files':
                    # Default value: take the first one seen for this site.
                    if bkp_site not in file_bkps:
                        file_bkps[bkp_site] = {'info': info, 'date': bkp_date, 'bkp': backup}

                    isOlderThanFourWeeks = bkp_date < date_4weeks_ago
                    isNewerThanSelected = bkp_date > file_bkps[bkp_site]['date']

                    if isNewerThanSelected:
                        # Remove the currently selected backup if it is more than
                        # 4 weeks old, since this newer one replaces it.
                        if file_bkps[bkp_site]['date'] < date_4weeks_ago:
                            bkps_in_excess.append(file_bkps[bkp_site])
                        file_bkps[bkp_site] = {'info': info, 'date': bkp_date, 'bkp': backup}
                    elif isOlderThanFourWeeks:
                        # print "Select to remove FILES backups older than 4 weeks. Site: ", bkp_site, ' Date: ', bkp_date
                        bkps_in_excess.append({'info': info, 'date': bkp_date, 'bkp': backup})
                elif bkp_type == 'db':
                    # Initialize the SITE map
                    if bkp_site not in db_bkps:
                        db_bkps[bkp_site] = {year_sel: {}}
                    # Initialize the SITE > YEAR map
                    if year_sel not in db_bkps[bkp_site]:
                        db_bkps[bkp_site][year_sel] = {}

                    if week_sel not in db_bkps[bkp_site][year_sel]:
                        db_bkps[bkp_site][year_sel][week_sel] = {'info': info, 'date': bkp_date, 'year': year_sel, 'week': week_sel, 'bkp': backup}
                    elif db_bkps[bkp_site][year_sel][week_sel]['date'] < bkp_date:
                        # Mark the previously selected backup as extra to remove,
                        # as we only keep the most recent backup per week.
                        bkps_in_excess.append(db_bkps[bkp_site][year_sel][week_sel])
                        db_bkps[bkp_site][year_sel][week_sel] = {'info': info, 'date': bkp_date, 'year': year_sel, 'week': week_sel, 'bkp': backup}
                    else:
                        # Only keep 1 (latest) backup per week.
                        bkps_in_excess.append({'info': info, 'year': year_sel, 'week': week_sel, 'bkp': backup})
                else:
                    print 'Unknown backup type: ', bkp_type
            else:
                print 'No matching groups:', line
        except IndexError:
            print 'Skip line: ', line
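    # Note: bkps_in_excess preserves the order of the `s3cmd ls` listing
    # (lexicographic by key), so entries arrive grouped by site; the
    # per-instance headers printed below rely on that ordering.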
print "\n##### 3. Remove extra backups...\n"
cur_instance = None
cur_type = None
chosen_bkp_date = None
for bkp_data in bkps_in_excess:
if (cur_instance is None or cur_instance != bkp_data['info']['site']):
cur_type = None
cur_instance = bkp_data['info']['site']
print "\n# Instance:", cur_instance
if (cur_type is None or cur_type != bkp_data['info']['bktype']):
cur_type = bkp_data['info']['bktype']
print '## Type: ', cur_type
if cur_type == 'files':
print '#'
print '+ Most recent: ', file_bkps[cur_instance]['date']
# print '+ Keeping: ', file_bkps[cur_instance]['date']
bkp_name = bkp_data['bkp'][3]
        if cur_type == 'db':
            _bkp_date_selection = db_bkps[cur_instance][bkp_data['year']][bkp_data['week']]['date']
            if chosen_bkp_date is None or chosen_bkp_date != _bkp_date_selection:
                chosen_bkp_date = _bkp_date_selection
                print '#'
                print '+ Keeping weekly: ', chosen_bkp_date

        print '- Remove: ', bkp_data['info']['date']
        # Remove from S3. Capture stderr as well, otherwise communicate()
        # always returns err=None and the error check below can never fire.
        proc = subprocess.Popen(['/usr/local/bin/s3cmd', 'del', bkp_name],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (out, err) = proc.communicate()
        # print out
        if err:
            print "Error encountered:"
            print err

    print "##### X. ...done!\n"
@esolitos (Author)

Rev2: Also allow numbers when matching the site name.
