@esolitos
Last active November 12, 2018 10:35
UNMAINTAINED - See `ramsalt/cron-scripts`
import os, sys, subprocess, re
from datetime import datetime, timedelta

if __name__ == '__main__':
    now_date = datetime.today()
    date_7days_ago = now_date - timedelta(days=7)
    date_4weeks_ago = now_date - timedelta(weeks=4)

    bucket_name = 'INSERT_AWS_S3_BUCKET_NAME_HERE'

    db_bkps = {}
    file_bkps = {}
    bkps_in_excess = []
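    # Retention policy (inferred from the logic below, not stated by the
    # original author): backups newer than 7 days are never touched; "files"
    # backups older than 4 weeks are pruned down to the most recent one per
    # site; "db" backups are pruned down to the newest backup of each ISO week.
    # Index structures: file_bkps[site] holds the currently selected files
    # backup, db_bkps[site]['Y<iso-year>']['W<iso-week>'] holds the selected
    # weekly db backup, and bkps_in_excess collects everything to delete.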
print "\n##### 1. Fetch list of stored backups from S3...\n"
proc = subprocess.Popen(['/usr/local/bin/s3cmd', 'ls', 's3://'+bucket_name+'/'], stdout=subprocess.PIPE)
(out, err) = proc.communicate()
# print out
# print err
## Debug!
# f = open('out.txt', 'w')
# f.write(out)
# f.close()
print "\n##### 2. Match and index all items...\n"
for line in out.split('\n'):
# for line in open('out.txt', 'r').read().split('\n'):
backup = line.split()
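        # An `s3cmd ls` line looks roughly like
        #   2018-11-05 03:12   1234567   s3://<bucket>/<object key>
        # so after split() the object URI sits at index 3. A hypothetical key
        # name in the expected format (not taken from a real bucket):
        #   s3://my-bucket/mysite_prod_2018-11-05T03-00-00_ab12cd34_db.sql.gz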
        try:
            re_s3_prefix = '(?:' + re.escape('s3://'+bucket_name+'/') + ')'
            re_grp_site = '(?P<site>[a-z0-9\-]+_[a-z]+)'
            re_grp_datetime = '(?P<date>(?:[\d\-]+){3}T(?:[\d\-]+){3})'
            re_grp_hash = '(?P<hash>[a-z0-9]+)'
            re_grp_bktype = '(?P<bktype>files|db)'
            b_info_pattern = '^'+re_s3_prefix+re_grp_site+'_'+re_grp_datetime+'_'+re_grp_hash+'_'+re_grp_bktype+'.*(?:\.(tar|sql)(\.gz)?)$'

            match = re.match(b_info_pattern, backup[3], flags=re.IGNORECASE)
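            # For the hypothetical key above, groupdict() would be something like
            # {'site': 'mysite_prod', 'date': '2018-11-05T03-00-00',
            #  'hash': 'ab12cd34', 'bktype': 'db'}.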
            if match is not None:
                # print match.groupdict()
                info = match.groupdict()
                bkp_type = info['bktype']
                bkp_site = info['site']
                bkp_date = datetime.strptime(info['date'], "%Y-%m-%dT%H-%M-%S")
                date_cal = bkp_date.isocalendar()
                year_sel = 'Y' + str(date_cal[0])
                week_sel = 'W' + str(date_cal[1])
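                # isocalendar() returns (ISO year, ISO week, ISO weekday), e.g.
                # datetime(2018, 11, 5).isocalendar() == (2018, 45, 1),
                # giving year_sel == 'Y2018' and week_sel == 'W45'.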
                # Leave backups from the last 7 days untouched.
                if bkp_date > date_7days_ago:
                    # print "Keep ", bkp_type, " from ", bkp_date, " for: ", bkp_site
                    continue
                # Simple logic for FILE backups
                if bkp_type == 'files':
                    # Default value: take the first one seen for this site.
                    if bkp_site not in file_bkps:
                        file_bkps[bkp_site] = {'info': info, 'date': bkp_date, 'bkp': backup}

                    isOlderThanFourWeeks = bkp_date < date_4weeks_ago
                    isNewerThanSelected = bkp_date > file_bkps[bkp_site]['date']

                    if isNewerThanSelected:
                        # Remove the currently selected backup if it is more than
                        # 4 weeks old, since this newer one replaces it.
                        if file_bkps[bkp_site]['date'] < date_4weeks_ago:
                            bkps_in_excess.append(file_bkps[bkp_site])
                        file_bkps[bkp_site] = {'info': info, 'date': bkp_date, 'bkp': backup}
                    elif isOlderThanFourWeeks:
                        # print "Select to remove FILES backups older than 4 weeks. Site: ", bkp_site, ' Date: ', bkp_date
                        bkps_in_excess.append({'info': info, 'date': bkp_date, 'bkp': backup})
                elif bkp_type == 'db':
                    # Initialize the SITE map
                    if bkp_site not in db_bkps:
                        db_bkps[bkp_site] = {year_sel: {}}
                    # Initialize the SITE > YEAR map
                    if year_sel not in db_bkps[bkp_site]:
                        db_bkps[bkp_site][year_sel] = {}

                    if week_sel not in db_bkps[bkp_site][year_sel]:
                        db_bkps[bkp_site][year_sel][week_sel] = {'info': info, 'date': bkp_date, 'year': year_sel, 'week': week_sel, 'bkp': backup}
                    elif db_bkps[bkp_site][year_sel][week_sel]['date'] < bkp_date:
                        # Mark the previously selected backup as extra to remove,
                        # as we only keep the most recent backup per week.
                        bkps_in_excess.append(db_bkps[bkp_site][year_sel][week_sel])
                        db_bkps[bkp_site][year_sel][week_sel] = {'info': info, 'date': bkp_date, 'year': year_sel, 'week': week_sel, 'bkp': backup}
                    else:
                        # Only keep 1 (latest) backup per week.
                        bkps_in_excess.append({'info': info, 'year': year_sel, 'week': week_sel, 'bkp': backup})
                else:
                    print 'Unknown backup type: ', bkp_type
            else:
                print 'No matching groups:', line
        except IndexError:
            print 'Skip line: ', line
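    # Note: bkps_in_excess preserves the order of the `s3cmd ls` listing
    # (lexicographic by key), so entries arrive grouped by site; the
    # per-instance headers printed below rely on that ordering.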
print "\n##### 3. Remove extra backups...\n"
cur_instance = None
cur_type = None
chosen_bkp_date = None
for bkp_data in bkps_in_excess:
if (cur_instance is None or cur_instance != bkp_data['info']['site']):
cur_type = None
cur_instance = bkp_data['info']['site']
print "\n# Instance:", cur_instance
if (cur_type is None or cur_type != bkp_data['info']['bktype']):
cur_type = bkp_data['info']['bktype']
print '## Type: ', cur_type
if cur_type == 'files':
print '#'
print '+ Most recent: ', file_bkps[cur_instance]['date']
# print '+ Keeping: ', file_bkps[cur_instance]['date']
bkp_name = bkp_data['bkp'][3]
        if cur_type == 'db':
            _bkp_date_selection = db_bkps[cur_instance][bkp_data['year']][bkp_data['week']]['date']
            if chosen_bkp_date is None or chosen_bkp_date != _bkp_date_selection:
                chosen_bkp_date = _bkp_date_selection
                print '#'
                print '+ Keeping weekly: ', chosen_bkp_date

        print '- Remove: ', bkp_data['info']['date']
        # Remove from S3. Capture stderr as well, otherwise communicate()
        # always returns err=None and the error check below can never fire.
        proc = subprocess.Popen(['/usr/local/bin/s3cmd', 'del', bkp_name],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (out, err) = proc.communicate()
        # print out
        if err:
            print "Error encountered:"
            print err

    print "##### X. ...done!\n"
@esolitos (Author)

Rev2: Also allow numbers when matching the site name.
