Last active
July 18, 2024 20:56
-
-
Save prehensilecode/d9e8b0cadbf0bf6b7feb3eb5ec1a9c22 to your computer and use it in GitHub Desktop.
GPFS snapshotter script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3.6 | |
# Author: David Chin david.chin@pennmedicine.upenn.edu | |
import sys | |
import os | |
import shlex | |
import subprocess | |
import argparse | |
import datetime | |
import calendar | |
from pathlib import Path | |
### NOTE Python 3.6.8 is latest Python 3 on DSS nodes | |
### NOTE GPFS snapshots handle datetime in LOCAL TIME | |
### NOTE Don't need to use a lock because the snapshot operations will lock themselves. | |
### e.g. Trying to run mmlssnapshot while mmcrsnapshot is running gives: | |
### Unable to start tslssnapshot on 'fs001' because conflicting program | |
### tscrsnapshot is running. Waiting until it completes or moves to the | |
### next phase, which may allow the current command to start. | |
### INSTALLATION | |
### - copy this script to dss001:/usr/local/sbin | |
### - create cron job: dss001:/etc/cron.d/snapshot_fs001 | |
### */15 * * * * root /usr/local/sbin/snapshotter.py | |
### EXAMPLE OF SNAPSHOT DELETION for reference | |
# 15:47:35 root@dss001 ~ # mmlssnapshot fs001 | |
# Snapshots in file system fs001: | |
# Directory SnapId Status Created ExpirationTime | |
# bacula-projects 128150 Valid Thu Jul 20 04:45:04 2023 Thu Jul 20 04:45:04 2023 | |
# 2024_06_03-04.30 155000 Valid Mon Jun 3 04:30:08 2024 Mon Jun 3 04:30:08 2024 | |
# post-outage 155001 Valid Mon Jul 8 18:56:06 2024 Mon Jul 8 18:56:06 2024 | |
# post-outage1 155002 Valid Tue Jul 9 11:33:26 2024 Tue Jul 9 11:33:26 2024 | |
# post-outage2 155003 Valid Wed Jul 10 11:33:16 2024 Wed Jul 10 11:33:16 2024 | |
# post-outage2.1 155004 Valid Wed Jul 10 12:55:40 2024 Sat Aug 10 16:55:00 2024 | |
# test_quick_expire 155005 Valid Wed Jul 10 12:56:16 2024 Wed Jul 10 16:57:00 2024 | |
# post-outage2.2 155006 Valid Wed Jul 10 15:24:10 2024 Wed Jul 10 15:40:00 2024 | |
# 15min-20240710T1930 155007 Valid Wed Jul 10 15:30:02 2024 Wed Jul 10 23:30:00 2024 | |
# 15min-20240710T1945 155008 Valid Wed Jul 10 15:45:02 2024 Wed Jul 10 23:45:00 2024 | |
# | |
# 15:48:27 root@dss001 ~ # mmdelsnapshot fs001 post-outage2.2 | |
# Invalidating snapshot files in :post-outage2.2... | |
# Deleting files in snapshot :post-outage2.2... | |
# 100.00 % complete on Wed Jul 10 15:48:39 2024 (1497382912 inodes with total 1446 MB data processed) | |
# Invalidating snapshot files in :post-outage2.2/F/... | |
# Delete snapshot :post-outage2.2 successful. | |
# | |
# 15:50:59 root@dss001 ~ # mmlssnapshot fs001 | |
# Snapshots in file system fs001: | |
# Directory SnapId Status Created ExpirationTime | |
# bacula-projects 128150 Valid Thu Jul 20 04:45:04 2023 Thu Jul 20 04:45:04 2023 | |
# 2024_06_03-04.30 155000 Valid Mon Jun 3 04:30:08 2024 Mon Jun 3 04:30:08 2024 | |
# post-outage 155001 Valid Mon Jul 8 18:56:06 2024 Mon Jul 8 18:56:06 2024 | |
# post-outage1 155002 Valid Tue Jul 9 11:33:26 2024 Tue Jul 9 11:33:26 2024 | |
# post-outage2 155003 Valid Wed Jul 10 11:33:16 2024 Wed Jul 10 11:33:16 2024 | |
# post-outage2.1 155004 Valid Wed Jul 10 12:55:40 2024 Sat Aug 10 16:55:00 2024 | |
# test_quick_expire 155005 Valid Wed Jul 10 12:56:16 2024 Wed Jul 10 16:57:00 2024 | |
# 15min-20240710T1930 155007 Valid Wed Jul 10 15:30:02 2024 Wed Jul 10 23:30:00 2024 | |
# 15min-20240710T1945 155008 Valid Wed Jul 10 15:45:02 2024 Wed Jul 10 23:45:00 2024 | |
# Name of the GPFS file system whose snapshots this script manages.
_FSNAME = 'fs001'

# Snapshot retention policy: how many snapshots to keep for each interval.
#   15-min  snapshots: keep 16
#   1-hr    snapshots: keep 12
#   1-day   snapshots: keep 14
#   1-week  snapshots: keep  8
#   1-month snapshots: keep  6
#   1-year  snapshots: keep  1
#   => 57 retained snapshots in total
_POLICY = {
    '15min': 16,
    '1hr': 12,
    '1day': 14,
    '1wk': 8,
    '1mth': 6,
    '1yr': 1,
}

# Default location of the policy file.
_DEFAULT_POLICY_FILE = '/usr/local/etc/snapshot_policy.txt'

# Snapshot name format:
#   {policystr}-YYYYMMDDTHHMM
# where policystr is one of the keys of _POLICY.
def read_policy_file_maybe(policyfile=_DEFAULT_POLICY_FILE, debug=False, verbose=False):
    """Override entries of the global _POLICY from a policy file, if one exists.

    The file holds one ``key: value`` entry per line, where ``value`` is an
    integer retention count.  Blank lines and lines starting with ``#`` are
    skipped, and whitespace around the key and the value is ignored.  Keys
    found in the file replace (or add to) the entries already in _POLICY;
    entries not mentioned in the file keep their built-in defaults.

    A missing file is not an error: the defaults simply stay in effect.

    Args:
        policyfile: path of the policy file to read.
        debug: accepted for a uniform signature across this script; unused.
        verbose: print informational messages.

    Raises:
        ValueError: a non-blank, non-comment line is not of the form
            ``key: integer``.  The message names the file and line number.
    """
    global _POLICY

    if verbose:
        print(f'INFO: snapshotter.py: using policy file {policyfile}')

    try:
        f = open(policyfile, 'r')
    except FileNotFoundError:
        # file does not exist
        if verbose:
            print(f'INFO: snapshotter.py: no policy file {policyfile}; using defaults')
        return

    with f:
        for lineno, line in enumerate(f, start=1):
            entry = line.strip()
            # A trailing newline or an explanatory comment must not abort the run.
            if not entry or entry.startswith('#'):
                continue

            key, sep, val = entry.partition(':')
            key = key.strip()
            if not sep or not key:
                raise ValueError(
                    f'{policyfile}:{lineno}: expected "key: value", got {entry!r}')

            try:
                count = int(val)
            except ValueError as err:
                raise ValueError(
                    f'{policyfile}:{lineno}: retention count must be an integer, '
                    f'got {val.strip()!r}') from err

            _POLICY[key] = count

    return
def _parse_mmlssnapshot_output(text):
    """Split raw mmlssnapshot output into column names and per-snapshot rows.

    The expected layout is a title line, a header line of column names, then
    one line per snapshot whose first three fields are single tokens and whose
    next two fields are five-token timestamps (e.g. ``Thu Jul 20 04:45:04 2023``).

    Returns:
        ``(colnames, rows)`` where each row is a list of five strings.  Both
        are empty when the output has no header line.
    """
    lines = [stripped for stripped in (raw.strip() for raw in text.splitlines()) if stripped]

    # Fewer than two lines means there is no header to read column names from.
    # NOTE(review): presumably this is what mmlssnapshot prints when the file
    # system has no snapshots -- confirm against the GPFS documentation.
    if len(lines) < 2:
        return [], []

    colnames = lines[1].split()
    rows = []
    for line in lines[2:]:
        fields = line.split()
        # Tokens 3-7 are the creation time, tokens 8-12 the expiration time.
        rows.append([*fields[:3], ' '.join(fields[3:8]), ' '.join(fields[8:13])])
    return colnames, rows


def get_snapshot_list(debug=False, verbose=False):
    """Return a list of dicts describing all snapshots of the file system.

    Runs ``mmlssnapshot`` on _FSNAME and converts every snapshot line into a
    dict keyed by the column names of the header line (``Directory``,
    ``SnapId``, ``Status``, ``Created``, ``ExpirationTime`` in the output
    quoted at the top of this file).

    Args:
        debug: print the parsed rows and the resulting dicts.
        verbose: accepted for a uniform signature across this script; unused.

    Returns:
        A list with one dict per snapshot; empty if the command printed no
        header line.

    Exits with status 1 if mmlssnapshot cannot be run, fails, or produces
    output that is not valid UTF-8.
    """
    lssnap = ['/usr/lpp/mmfs/bin/mmlssnapshot', _FSNAME]

    try:
        p = subprocess.run(lssnap, stdout=subprocess.PIPE, check=True)
        output = p.stdout.decode('utf-8')
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        print(f'EXCEPTION: mmlssnap error: {e}')
        sys.exit(1)

    colnames, snaplist = _parse_mmlssnapshot_output(output)

    if debug:
        print(f'DEBUG: snapshotter.py: get_snapshot_list():')
        for snap in snaplist:
            print(f'DEBUG: snapshotter.py: {snap}')
        print()

    retval = [dict(zip(colnames, s)) for s in snaplist]

    if debug:
        print(f'DEBUG: snapshotter.py: get_snapshot_list():')
        for s in retval:
            print(f'DEBUG: snapshotter.py: {s}')
        print()

    return retval
### XXX this is not used/needed
def get_snapshot_sublist(snaplist=None, interval='', debug=False, verbose=False):
    """Filter the full snapshot list down to the snapshots of one interval.

    A snapshot belongs to an interval when the part of its ``Directory`` name
    before the first ``-`` equals ``interval``.

    Exits with status 3 when ``snaplist`` is empty or missing, when
    ``interval`` is empty, or when ``interval`` is not a key of _POLICY.
    """
    # Validate the arguments in order; the first failing check wins.
    problem = None
    if not snaplist:
        problem = 'ERROR: get_snapshot_sublist(): invalid snaplist given'
    elif not interval:
        problem = 'ERROR: get_snapshot_sublist(): interval must be specified'
    elif interval not in _POLICY:
        problem = f'ERROR: get_snapshot_sublist(): "{interval}" interval not defined'

    if problem is not None:
        print(problem)
        sys.exit(3)

    return [entry for entry in snaplist
            if entry['Directory'].partition('-')[0] == interval]
def _snapshot_is_due(interval, now):
    """Return True if a snapshot of the given interval should be taken at `now`.

    Intervals other than the six built-in ones are never due.
    """
    on_the_hour = now.minute == 0
    at_midnight = on_the_hour and now.hour == 0
    schedule = {
        '15min': now.minute % 15 == 0,
        '1hr': on_the_hour,
        '1day': at_midnight,
        '1wk': at_midnight and now.weekday() == 0,  # Monday == 0
        '1mth': at_midnight and now.day == 1,
        '1yr': at_midnight and now.day == 1 and now.month == 1,
    }
    return schedule.get(interval, False)


def _snapshot_expiration(interval, now, count):
    """Return when a snapshot taken at `now` expires, keeping `count` of them.

    The lifetime is `count` times the interval length, so that `count`
    snapshots of that interval exist at any time.
    """
    if interval == '15min':
        return now + datetime.timedelta(minutes=15 * count)
    if interval == '1hr':
        return now + datetime.timedelta(hours=count)
    if interval == '1day':
        return now + datetime.timedelta(days=count)
    if interval == '1wk':
        return now + datetime.timedelta(days=7 * count)
    if interval == '1mth':
        # First day of the month `count` months ahead.  Counting months from
        # year 0 keeps the month in 1..12 and carries any number of years.
        month_index = now.year * 12 + (now.month - 1) + count
        return datetime.datetime(month_index // 12, month_index % 12 + 1, 1)
    if interval == '1yr':
        # Midnight on the same calendar day `count` years ahead; Feb 29 is
        # clamped to Feb 28 when the target year is not a leap year.
        target_year = now.year + count
        last_day = calendar.monthrange(target_year, now.month)[1]
        return datetime.datetime(target_year, now.month, min(now.day, last_day))
    raise ValueError(f'unknown snapshot interval "{interval}"')


def _run_mmcrsnapshot(mmcrsnapshot, interval, snapname, verbose):
    """Run one mmcrsnapshot command; exit with status 1 if it fails."""
    if verbose:
        print(f'snapshotter.py: create {interval} snap {snapname}')

    try:
        p = subprocess.run(mmcrsnapshot, stdout=subprocess.PIPE, check=True)
    except (OSError, subprocess.SubprocessError) as e:
        print(f'EXCEPTION: snapshotter.py: mmcrsnapshot error: {e}')
        sys.exit(1)

    if verbose:
        print(f'snapshotter.py: mmcrsnapshot output:')
        # The snapshot already exists here; odd bytes in the tool's output
        # must not turn a success into a failure.
        print(p.stdout.decode('utf-8', errors='replace'))
        print(f'snapshotter.py: snapshot {snapname} created')


def create_snapshots(debug=False, verbose=False):
    """Create every snapshot that is due at the current local time.

    For each interval in _POLICY a snapshot named ``{interval}-YYYYMMDDTHHMM``
    is created when the current minute matches that interval's schedule:

      - 15min: minute is a multiple of 15
      - 1hr:   minute 0
      - 1day:  00:00
      - 1wk:   Monday 00:00
      - 1mth:  first day of the month, 00:00
      - 1yr:   January 1st, 00:00

    Each snapshot is given an expiration time of (retention count x interval
    length) from now, passed to mmcrsnapshot via ``--expiration-time``.
    Expiration times are only computed for snapshots that are actually due.
    Policy keys other than the six above are ignored here.

    Args:
        debug: dry run -- print the mmcrsnapshot commands instead of running
            them.
        verbose: print progress messages and the mmcrsnapshot output.

    Exits with status 1 as soon as one mmcrsnapshot invocation fails.
    """
    # do this in UTC to avoid duplicate or skip due to daylight saving
    # FIXME temporary kluge: GPFS does everything in localtime, and TZ on
    # DSS nodes are America/New_York
    now = datetime.datetime.now()
    dayofweek = now.weekday()  # Monday == 0
    timestamp = now.strftime('%Y%m%dT%H%M')

    if debug:
        print(f'DEBUG: snapshotter.py: create_snapshot(): now = {now}')
        print(f'DEBUG: snapshotter.py: create_snapshot(): dayofweek = {dayofweek}')
        print(f'DEBUG: snapshotter.py: create_snapshot(): timestamp = {timestamp}')

    for interval, count in _POLICY.items():
        snapname = f'{interval}-{timestamp}'

        if debug:
            print(f'DEBUG: snapshotter.py: create_snapshot(): snapname = {snapname}')

        if not _snapshot_is_due(interval, now):
            continue

        # mmcrsnapshot specifies expiration time in format yyyy-mm-dd-hh:mm[:ss]
        expiration = _snapshot_expiration(interval, now, count)
        mmcrsnapshot = ['/usr/lpp/mmfs/bin/mmcrsnapshot', _FSNAME, snapname,
                        '--expiration-time', expiration.strftime('%Y-%m-%d-%H:%M')]

        if debug:
            print(f'DEBUG: snapshotter.py: create_snapshot(): {mmcrsnapshot}')
        else:
            _run_mmcrsnapshot(mmcrsnapshot, interval, snapname, verbose)

    return
def delete_snapshots_maybe(snaplist, debug=False, verbose=False):
    """Delete snapshots that are pending deletion or have expired.

    A snapshot is deleted when either

      - its ``Status`` is ``DeleteRequired`` (mmdelsnapshot was run on it
        previously but it was not actually deleted), or
      - it was created by this script -- the part of its name before the first
        ``-`` is a key of _POLICY -- and its ``ExpirationTime`` is in the past.

    Snapshots not created by this script are otherwise left alone, and their
    expiration time is never parsed.  A script-created snapshot whose
    expiration time cannot be parsed is kept, with a warning.

    Args:
        snaplist: list of snapshot dicts as returned by get_snapshot_list().
        debug: dry run -- report what would be deleted without deleting.
        verbose: print progress messages and the mmdelsnapshot output.

    Exits with status 1 as soon as one mmdelsnapshot invocation fails.
    """
    for snap in snaplist:
        # deletion is by the name of the snapshot, which is the "Directory"
        name = snap["Directory"]

        if snap["Status"] == "DeleteRequired":
            doomed = True
        elif name.split("-")[0] in _POLICY:
            exptext = snap.get("ExpirationTime", "")
            try:
                expiration = datetime.datetime.strptime(exptext, "%a %b %d %H:%M:%S %Y")
            except ValueError:
                # Never delete on a guess: keep the snapshot and say why.
                print(f'WARNING: snapshotter.py: snapshot {name} has unparseable expiration time "{exptext}"; not deleting')
                continue
            # FIXME temporary kluge: GPFS does everything in localtime, and TZ on
            # DSS nodes are America/New_York
            doomed = datetime.datetime.now() > expiration
        else:
            doomed = False

        if not doomed:
            continue

        if debug:
            print(f'DEBUG: delete_snapshots_maybe(): deleting snapshot {name} created on {snap["Created"]}')
            continue

        if verbose:
            print(f'snapshotter.py: deleting snapshot {name} created on {snap["Created"]}')

        mmdelsnapshot = ['/usr/lpp/mmfs/bin/mmdelsnapshot', _FSNAME, name]
        try:
            p = subprocess.run(mmdelsnapshot, stdout=subprocess.PIPE, check=True)
        except (OSError, subprocess.SubprocessError) as e:
            print(f'EXCEPTION: snapshotter.py: mmdelsnapshot error: {e}')
            sys.exit(1)

        if verbose:
            print(f'snapshotter.py: mmdelsnapshot output:')
            # Iterate over lines; iterating the string itself would print
            # one character per line.
            for line in p.stdout.decode('utf-8', errors='replace').splitlines():
                print(f' {line}')
            print(f'snapshotter.py: snapshot {name} deleted')

    return
def main():
    """Command-line entry point: load the policy, then create and prune snapshots.

    Options:
      -d/--debug        debugging output
      -v/--verbose      verbose output
      -p/--policy-file  policy file to read (defaults to _DEFAULT_POLICY_FILE)

    The list of existing snapshots is taken before any new snapshot is
    created, so the deletion pass only considers snapshots that already
    existed when the script started.
    """
    cli = argparse.ArgumentParser(
        prog='snapshotter.py',
        description='Create and manage GPFS snapshots')
    cli.add_argument('-d', '--debug', action='store_true',
                     help='debugging output')
    cli.add_argument('-v', '--verbose', action='store_true',
                     help='verbose output')
    cli.add_argument('-p', '--policy-file', default=_DEFAULT_POLICY_FILE,
                     help='read policy from given file')
    opts = cli.parse_args()

    # The same two switches are handed to every step below.
    switches = {'debug': opts.debug, 'verbose': opts.verbose}

    read_policy_file_maybe(policyfile=opts.policy_file, **switches)

    if opts.verbose:
        print(f'snapshotter.py: current time = {datetime.datetime.now()}')
        print(f'snapshotter.py: current time UTC = {datetime.datetime.utcnow()}')
        print(f'snapshotter.py: POLICY = {_POLICY}')

    existing = get_snapshot_list(**switches)
    create_snapshots(**switches)
    delete_snapshots_maybe(existing, **switches)
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment