Skip to content

Instantly share code, notes, and snippets.

@prehensilecode
Last active July 18, 2024 20:56
Show Gist options
  • Save prehensilecode/d9e8b0cadbf0bf6b7feb3eb5ec1a9c22 to your computer and use it in GitHub Desktop.
Save prehensilecode/d9e8b0cadbf0bf6b7feb3eb5ec1a9c22 to your computer and use it in GitHub Desktop.
GPFS snapshotter script
#!/usr/bin/env python3.6
# Author: David Chin david.chin@pennmedicine.upenn.edu
import sys
import os
import shlex
import subprocess
import argparse
import datetime
import calendar
from pathlib import Path
### NOTE Python 3.6.8 is latest Python 3 on DSS nodes
### NOTE GPFS snapshots handle datetime in LOCAL TIME
### NOTE Don't need to use a lock because the snapshot operations will lock themselves.
### e.g. Trying to run mmlssnapshot while mmcrsnapshot is running gives:
### Unable to start tslssnapshot on 'fs001' because conflicting program
### tscrsnapshot is running. Waiting until it completes or moves to the
### next phase, which may allow the current command to start.
### INSTALLATION
### - copy this script to dss001:/usr/local/sbin
### - create cron job: dss001:/etc/cron.d/snapshot_fs001
### */15 * * * * root /usr/local/sbin/snapshotter.py
### EXAMPLE OF SNAPSHOT DELETION for reference
# 15:47:35 root@dss001 ~ # mmlssnapshot fs001
# Snapshots in file system fs001:
# Directory SnapId Status Created ExpirationTime
# bacula-projects 128150 Valid Thu Jul 20 04:45:04 2023 Thu Jul 20 04:45:04 2023
# 2024_06_03-04.30 155000 Valid Mon Jun 3 04:30:08 2024 Mon Jun 3 04:30:08 2024
# post-outage 155001 Valid Mon Jul 8 18:56:06 2024 Mon Jul 8 18:56:06 2024
# post-outage1 155002 Valid Tue Jul 9 11:33:26 2024 Tue Jul 9 11:33:26 2024
# post-outage2 155003 Valid Wed Jul 10 11:33:16 2024 Wed Jul 10 11:33:16 2024
# post-outage2.1 155004 Valid Wed Jul 10 12:55:40 2024 Sat Aug 10 16:55:00 2024
# test_quick_expire 155005 Valid Wed Jul 10 12:56:16 2024 Wed Jul 10 16:57:00 2024
# post-outage2.2 155006 Valid Wed Jul 10 15:24:10 2024 Wed Jul 10 15:40:00 2024
# 15min-20240710T1930 155007 Valid Wed Jul 10 15:30:02 2024 Wed Jul 10 23:30:00 2024
# 15min-20240710T1945 155008 Valid Wed Jul 10 15:45:02 2024 Wed Jul 10 23:45:00 2024
#
# 15:48:27 root@dss001 ~ # mmdelsnapshot fs001 post-outage2.2
# Invalidating snapshot files in :post-outage2.2...
# Deleting files in snapshot :post-outage2.2...
# 100.00 % complete on Wed Jul 10 15:48:39 2024 (1497382912 inodes with total 1446 MB data processed)
# Invalidating snapshot files in :post-outage2.2/F/...
# Delete snapshot :post-outage2.2 successful.
#
# 15:50:59 root@dss001 ~ # mmlssnapshot fs001
# Snapshots in file system fs001:
# Directory SnapId Status Created ExpirationTime
# bacula-projects 128150 Valid Thu Jul 20 04:45:04 2023 Thu Jul 20 04:45:04 2023
# 2024_06_03-04.30 155000 Valid Mon Jun 3 04:30:08 2024 Mon Jun 3 04:30:08 2024
# post-outage 155001 Valid Mon Jul 8 18:56:06 2024 Mon Jul 8 18:56:06 2024
# post-outage1 155002 Valid Tue Jul 9 11:33:26 2024 Tue Jul 9 11:33:26 2024
# post-outage2 155003 Valid Wed Jul 10 11:33:16 2024 Wed Jul 10 11:33:16 2024
# post-outage2.1 155004 Valid Wed Jul 10 12:55:40 2024 Sat Aug 10 16:55:00 2024
# test_quick_expire 155005 Valid Wed Jul 10 12:56:16 2024 Wed Jul 10 16:57:00 2024
# 15min-20240710T1930 155007 Valid Wed Jul 10 15:30:02 2024 Wed Jul 10 23:30:00 2024
# 15min-20240710T1945 155008 Valid Wed Jul 10 15:45:02 2024 Wed Jul 10 23:45:00 2024
# Name of the GPFS file system that is passed to mmlssnapshot, mmcrsnapshot
# and mmdelsnapshot.
_FSNAME='fs001'
# Snapshot retention policy - number of snapshots to retain
# - 15-min snapshots - retain 16
# - 1-hr snapshots - retain 12
# - 1-day snapshots - retain 14
# - 1-week snapshots - retain 8
# - 1-month snapshots - retain 6
# - 1-year snapshots - retain 1
# => total no. of retained snapshots = 57
# The retention count is not enforced by counting snapshots: it is converted
# into an expiration time (interval length x count) when a snapshot is created.
# These defaults may be overridden per key by the policy file.
_POLICY = {'15min': 16, '1hr': 12, '1day': 14, '1wk': 8, '1mth': 6, '1yr': 1}
# Policy file read when no --policy-file option is given; it is optional.
_DEFAULT_POLICY_FILE = '/usr/local/etc/snapshot_policy.txt'
# Snapshot name format
# - {policystr}-YYYYMMDDTHHMM
# where policystr is in _POLICY.keys()
# The text before the first '-' is how a snapshot is recognised as one that
# this script created.
def read_policy_file_maybe(policyfile=_DEFAULT_POLICY_FILE, debug=False, verbose=False):
    """Override entries of the global _POLICY from a policy file, if it exists.

    The file holds one ``key: value`` pair per line, where ``key`` is a
    snapshot interval name (e.g. ``15min``) and ``value`` is the integer
    number of snapshots to retain.  Blank lines and lines starting with
    ``#`` are ignored, and whitespace around the key and value is stripped.
    Keys not mentioned in the file keep their current value.

    Args:
        policyfile: path of the policy file.
        debug: accepted for a uniform call signature; not used here.
        verbose: if true, report which policy file is used or missing.

    Returns:
        None.  _POLICY is updated in place.

    Raises:
        ValueError: a non-blank, non-comment line is not ``key: integer``.
    """
    if verbose:
        print(f'INFO: snapshotter.py: using policy file {policyfile}')

    try:
        pf_res = Path(policyfile).resolve(strict=True)
    except FileNotFoundError:
        # A missing policy file is not an error: the built-in policy applies.
        if verbose:
            print(f'INFO: snapshotter.py: no policy file {policyfile}; using defaults')
        return

    with open(pf_res, 'r') as f:
        for lineno, line in enumerate(f, start=1):
            entry = line.strip()
            if not entry or entry.startswith('#'):
                continue

            key, sep, val = entry.partition(':')
            key = key.strip()
            if not sep or not key:
                raise ValueError(
                    f'{policyfile}:{lineno}: expected "key: value", got "{entry}"')

            try:
                count = int(val)
            except ValueError:
                raise ValueError(
                    f'{policyfile}:{lineno}: retention count for "{key}" '
                    f'is not an integer: "{val.strip()}"') from None

            # Item assignment mutates the module-level dict, so no "global"
            # declaration is required.
            _POLICY[key] = count
    return
def get_snapshot_list(debug=False, verbose=False):
    """Return all snapshots of the file system as a list of dicts.

    Runs ``mmlssnapshot`` and parses its tabular output.  The first non-blank
    line is a title and is discarded; the second holds the column names, which
    become the dict keys.  Every following line describes one snapshot: the
    first three whitespace-separated fields are taken as they are, the next
    five are joined into one date string and the five after that into a
    second date string, matching the five-word dates shown by mmlssnapshot
    (e.g. ``Wed Jul 10 15:30:02 2024``).  All values are strings.

    Args:
        debug: if true, print the parsed rows.
        verbose: accepted for a uniform call signature; not used here.

    Returns:
        A list with one dict per snapshot, or an empty list when the output
        contains no column-header line.

    Exits with status 1 if mmlssnapshot cannot be run or reports failure.
    """
    lssnap = ['/usr/lpp/mmfs/bin/mmlssnapshot', _FSNAME]

    try:
        p = subprocess.run(lssnap, stdout=subprocess.PIPE, check=True)
    except Exception as e:
        print(f'EXCEPTION: mmlssnap error: {e}')
        sys.exit(1)

    stripped = (line.strip() for line in p.stdout.decode('utf-8').splitlines())
    rawsnaplist = [line for line in stripped if line]

    # Without a title line and a header line there is nothing to parse.
    # NOTE(review): presumably this is what mmlssnapshot prints when the file
    # system has no snapshots at all - confirm against the GPFS documentation.
    if len(rawsnaplist) < 2:
        if debug:
            print(f'DEBUG: snapshotter.py: get_snapshot_list(): no snapshots listed')
        return []

    colnames = rawsnaplist[1].split()

    snaplist = []
    for row in rawsnaplist[2:]:
        fields = row.split()
        snaplist.append([*fields[:3], ' '.join(fields[3:8]), ' '.join(fields[8:13])])

    if debug:
        print(f'DEBUG: snapshotter.py: get_snapshot_list():')
        for snap in snaplist:
            print(f'DEBUG: snapshotter.py: {snap}')
        print()

    retval = [dict(zip(colnames, snap)) for snap in snaplist]

    if debug:
        print(f'DEBUG: snapshotter.py: get_snapshot_list():')
        for s in retval:
            print(f'DEBUG: snapshotter.py: {s}')
        print()

    return retval
### XXX this is not used/needed
def get_snapshot_sublist(snaplist=None, interval='', debug=False, verbose=False):
    """Return the snapshots in snaplist that belong to the given interval.

    A snapshot belongs to an interval when the part of its ``Directory``
    (its name) before the first ``-`` equals ``interval``.

    Args:
        snaplist: list of snapshot dicts, each with a ``Directory`` key.  An
            empty list is valid and yields an empty result.
        interval: an interval name that is a key of _POLICY, e.g. ``15min``.
        debug: accepted for a uniform call signature; not used here.
        verbose: accepted for a uniform call signature; not used here.

    Returns:
        A new list holding the matching snapshot dicts, in their original order.

    Exits with status 3 if snaplist is None, interval is empty, or interval
    is not a key of _POLICY.
    """
    # Only a missing list is an error; a file system may simply have no
    # snapshots yet.
    if snaplist is None:
        print('ERROR: get_snapshot_sublist(): invalid snaplist given')
        sys.exit(3)

    if not interval:
        print('ERROR: get_snapshot_sublist(): interval must be specified')
        sys.exit(3)

    if interval not in _POLICY:
        print(f'ERROR: get_snapshot_sublist(): "{interval}" interval not defined')
        sys.exit(3)

    return [snap for snap in snaplist if snap['Directory'].split('-')[0] == interval]
def _months_after(when, months):
    """Return midnight on the first day of the month `months` months after `when`."""
    # A zero-based month index turns the year carry into a plain floor
    # division, which stays correct when the sum lands exactly on December
    # or spans more than one year.
    month_index = (when.month - 1) + months
    return datetime.datetime(when.year + month_index // 12, month_index % 12 + 1, 1)


def _years_after(when, years):
    """Return midnight on the same calendar day `years` years after `when`."""
    target_year = when.year + years
    day = when.day
    # Feb 29 does not exist in a non-leap target year; fall back to Feb 28.
    if when.month == 2 and day == 29 and not calendar.isleap(target_year):
        day = 28
    return datetime.datetime(target_year, when.month, day)


def _snapshot_is_due(interval, now):
    """Return True if a snapshot of the given interval must be taken at `now`."""
    midnight = now.hour == 0 and now.minute == 0
    due = {
        '15min': now.minute % 15 == 0,
        '1hr': now.minute == 0,
        '1day': midnight,
        '1wk': midnight and now.weekday() == 0,  # Monday == 0
        '1mth': midnight and now.day == 1,
        '1yr': midnight and now.month == 1 and now.day == 1,
    }
    # Policy keys this script does not know how to schedule are never due.
    return due.get(interval, False)


def _snapshot_expiration(interval, retain, now):
    """Return the expiration datetime of a snapshot of `interval` taken at `now`."""
    if interval == '15min':
        return now + datetime.timedelta(minutes=15 * retain)
    if interval == '1hr':
        return now + datetime.timedelta(hours=retain)
    if interval == '1day':
        return now + datetime.timedelta(days=retain)
    if interval == '1wk':
        return now + datetime.timedelta(days=7 * retain)
    if interval == '1mth':
        return _months_after(now, retain)
    if interval == '1yr':
        return _years_after(now, retain)
    raise ValueError(f'unknown snapshot interval "{interval}"')


def _run_mmcrsnapshot(mmcrsnapshot, interval, snapname, verbose):
    """Run one mmcrsnapshot command; exit with status 1 if it fails."""
    if verbose:
        print(f'snapshotter.py: create {interval} snap {snapname}')
    try:
        p = subprocess.run(mmcrsnapshot, stdout=subprocess.PIPE, check=True)
        if verbose:
            print(f'snapshotter.py: mmcrsnapshot output:')
            print(p.stdout.decode('utf-8'))
    except Exception as e:
        print(f'EXCEPTION: snapshotter.py: mmcrsnapshot error: {e}')
        sys.exit(1)
    else:
        if verbose:
            print(f'snapshotter.py: snapshot {snapname} created')


def create_snapshots(debug=False, verbose=False):
    """Create every snapshot that is due at the current minute.

    For each interval in _POLICY a snapshot named ``{interval}-YYYYMMDDTHHMM``
    is created when the current local time matches that interval's schedule:

    - ``15min``: minute is a multiple of 15
    - ``1hr``:   minute is 0
    - ``1day``:  00:00
    - ``1wk``:   00:00 on a Monday
    - ``1mth``:  00:00 on the 1st of the month
    - ``1yr``:   00:00 on 1 January

    Each snapshot is given an expiration time of (interval length x retention
    count) after its creation.  For ``1mth`` and ``1yr`` the expiration is
    midnight of the resulting date rather than an exact offset.  Policy keys
    other than the six above are ignored.

    Args:
        debug: if true, print the commands that would be run instead of
            running them (dry run).
        verbose: if true, report progress and the mmcrsnapshot output.

    Returns:
        None.

    Exits with status 1 as soon as one mmcrsnapshot invocation fails.
    """
    # Local time is used on purpose: GPFS handles snapshot times in local
    # time, and the expiration passed to mmcrsnapshot must agree with it.
    # FIXME: ideally this would be done in UTC to avoid a duplicate or skipped
    # snapshot around daylight-saving changes.
    now = datetime.datetime.now()
    dayofweek = now.weekday()  # Monday == 0
    timestamp = f"{now.year}{now.month:02d}{now.day:02d}T{now.hour:02d}{now.minute:02d}"

    if debug:
        print(f'DEBUG: snapshotter.py: create_snapshot(): now = {now}')
        print(f'DEBUG: snapshotter.py: create_snapshot(): dayofweek = {dayofweek}')
        print(f'DEBUG: snapshotter.py: create_snapshot(): timestamp = {timestamp}')

    for interval, retain in _POLICY.items():
        snapname = f'{interval}-{timestamp}'
        if debug:
            print(f'DEBUG: snapshotter.py: create_snapshot(): snapname = {snapname}')

        if not _snapshot_is_due(interval, now):
            continue

        # The expiration is computed only for snapshots that are actually
        # taken, so a date that cannot be represented never aborts a run in
        # which that snapshot was not due anyway.
        expiration = _snapshot_expiration(interval, retain, now)

        # mmcrsnapshot specifies expiration time in format yyyy-mm-dd-hh:mm[:ss]
        expstr = f'{expiration.year}-{expiration.month:02d}-{expiration.day:02d}-{expiration.hour:02d}:{expiration.minute:02d}'
        mmcrsnapshot = ['/usr/lpp/mmfs/bin/mmcrsnapshot', _FSNAME, snapname,
                        '--expiration-time', expstr]

        if debug:
            print(f'DEBUG: snapshotter.py: create_snapshot(): {mmcrsnapshot}')
        else:
            _run_mmcrsnapshot(mmcrsnapshot, interval, snapname, verbose)
    return
def delete_snapshots_maybe(snaplist, debug=False, verbose=False):
    """Delete the snapshots in snaplist that are due for deletion.

    A snapshot is deleted when either

    - its ``Status`` is ``DeleteRequired``, i.e. mmdelsnapshot was run on it
      previously but the snapshot was not actually deleted; or
    - it was created by this script (the part of its name before the first
      ``-`` is a key of _POLICY) and its ``ExpirationTime`` has passed.

    Snapshots that match neither rule are never touched.

    Args:
        snaplist: list of snapshot dicts with the keys ``Directory`` (the
            snapshot name), ``Status``, ``Created`` and ``ExpirationTime``.
        debug: if true, print what would be deleted instead of deleting (dry run).
        verbose: if true, report progress and the mmdelsnapshot output.

    Returns:
        None.

    Exits with status 1 as soon as one mmdelsnapshot invocation fails.
    """
    for snap in snaplist:
        # deletion is by the name of the snapshot, which is the "Directory"
        snapname = snap["Directory"]

        if snap["Status"] == "DeleteRequired":
            expired = True
        elif snapname.split("-")[0] in _POLICY:
            # The expiration is parsed only for snapshots this script owns, so
            # an unexpected value on a foreign snapshot cannot abort the run.
            try:
                expiration = datetime.datetime.strptime(
                    snap["ExpirationTime"], "%a %b %d %H:%M:%S %Y")
            except (KeyError, ValueError) as e:
                print(f'WARNING: snapshotter.py: cannot parse expiration time of snapshot {snapname}: {e}; not deleting')
                continue
            # FIXME temporary kluge: GPFS does everything in localtime, and TZ
            # on DSS nodes are America/New_York
            # The clock is read per snapshot because a deletion can take a
            # while, so later snapshots are compared against a fresh time.
            expired = datetime.datetime.now() > expiration
        else:
            expired = False

        if not expired:
            continue

        if debug:
            print(f'DEBUG: delete_snapshots_maybe(): deleting snapshot {snapname} created on {snap["Created"]}')
            continue

        if verbose:
            print(f'snapshotter.py: deleting snapshot {snapname} created on {snap["Created"]}')

        mmdelsnapshot = ['/usr/lpp/mmfs/bin/mmdelsnapshot', _FSNAME, snapname]
        try:
            p = subprocess.run(mmdelsnapshot, stdout=subprocess.PIPE, check=True)
            if verbose:
                print(f'snapshotter.py: mmdelsnapshot output:')
                # splitlines() is required: iterating the decoded string
                # itself would print one character per line.
                for line in p.stdout.decode('utf-8').splitlines():
                    print(f' {line}')
        except Exception as e:
            print(f'EXCEPTION: snapshotter.py: mmdelsnapshot error: {e}')
            sys.exit(1)
        else:
            if verbose:
                print(f'snapshotter.py: snapshot {snapname} deleted')
    return
def main():
    """Command-line entry point: create due snapshots, then delete old ones.

    Steps, in order: parse the command line, apply the optional policy file,
    list the existing snapshots, create the snapshots due now, and finally
    delete the listed snapshots that are due for deletion.  The list is taken
    before creation, so snapshots created in this run are not candidates for
    deletion in the same run.
    """
    cli = argparse.ArgumentParser(prog='snapshotter.py',
                                  description='Create and manage GPFS snapshots')
    cli.add_argument('-d', '--debug', action='store_true', help='debugging output')
    cli.add_argument('-v', '--verbose', action='store_true', help='verbose output')
    cli.add_argument('-p', '--policy-file', default=_DEFAULT_POLICY_FILE,
                     help='read policy from given file')
    opts = cli.parse_args()

    # Both flags are handed unchanged to every step below.
    flags = {'debug': opts.debug, 'verbose': opts.verbose}

    read_policy_file_maybe(policyfile=opts.policy_file, **flags)

    if opts.verbose:
        print(f'snapshotter.py: current time = {datetime.datetime.now()}')
        print(f'snapshotter.py: current time UTC = {datetime.datetime.utcnow()}')
        print(f'snapshotter.py: POLICY = {_POLICY}')

    existing = get_snapshot_list(**flags)
    create_snapshots(**flags)
    delete_snapshots_maybe(existing, **flags)
# Run only when executed as a script (e.g. by the cron job), not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment