Skip to content

Instantly share code, notes, and snippets.

@Andersson007
Created July 31, 2018 07:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Andersson007/f781b7c097645d051ffa6413606d0669 to your computer and use it in GitHub Desktop.
Save Andersson007/f781b7c097645d051ffa6413606d0669 to your computer and use it in GitHub Desktop.
The script checks a Postgres dump and sends a mail notification with the status and statistics.
#!/usr/bin/python
# pgdump_check.py - checks postgres dump and
# sends mail notification about status and statistic
#
# Author: Andrey Klychkov aaklychkov@mail.ru
# Version: 1.0
# Date: 2018-07-31
# Licence: Copyleft free software
# Requirements:
# * Python 2.6;
# * dumps in the Directory format, sorted by timestamp in names,
# for example, dump dir names may have the "YYYY-MM-DD_testdb" form
#
# This script:
# 0) creates its own log file
# 1) checks hostname and postgres process, it must be running
# 2) checks RECOVER_DB in the cluster, if it exists, exit
# 3) finds the last dump
# 4) runs pg_restore
# 5) checks the postgres log for errors
# 6) sends stat report about execution time and cluster size
# after restoring
# 7) drop the recovered database
# 8) remove own log files
#
# Notification output example:
# ----------------------------
# mysrv01: restore testdb /mnt/backups/edi/2018-07-28_testdb done:
# exec_time=1 day, 11:11:39.747817
# cluster_size=743GB
import datetime
import os
import shutil
import smtplib
import socket
import subprocess
import sys
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
VERSION = '1.0'
HOSTNAME = socket.gethostname()  # used to guard against running on the wrong host
NOW = datetime.datetime.now()
F_TIME = NOW.strftime('%Y%m%d')  # date stamp appended to this script's log name

#######################
# CONFIGURATION BLOCK #
#######################

# Main params:
PGDATA = '/var/lib/pgsql/10/data'  # path to PGDATA
RESTORE_HOSTNAME = 'mysrv1'  # the only host allowed to restore the dump;
                             # must equal $HOSTNAME -- prevents the final
                             # "drop database" from hitting a production server
DUMP_DIR = '/mnt/backups/edi/'  # directory that contains the dumps
DUMP_SUFFIX = '_testdb'  # dump directory name suffix
RECOVER_DB = 'testdb'  # database name to recover
RECOVER_JOBS = 4  # number of parallel pg_restore jobs (-j)
PGLOG_DIR = PGDATA+'/pg_log'  # path to the postgres log directory
PGLOG_SUFFIX = 'csv'  # suffix of postgres log files

# Logging params of this script:
LOG_DIR = '/tmp/'
LOG_PREF = 'pgdump_check.log_'
LOG = LOG_DIR+LOG_PREF+F_TIME
KEEP_FILES = 7  # number of this script's old log files to keep

# Mailing params:
SEND_MAIL = True
SENDER = 'report.mycompany@gmail.com'
RECIPIENT = ['receiver1@mydomain.com']
SMTP_SRV = 'smtp.gmail.com'
SMTP_PORT = 587
SMTP_PASS = 'PassForSender'  # NOTE(review): hardcoded credential -- consider
                             # reading it from an env var or a protected file
def send_mail(sbj, mesg):
    """Send a plain-text notification e-mail.

    sbj  -- message subject
    mesg -- message body

    Does nothing when SEND_MAIL is disabled. Uses STARTTLS against
    SMTP_SRV:SMTP_PORT and authenticates as SENDER.
    """
    if not SEND_MAIL:
        return
    msg = MIMEMultipart()
    msg['Subject'] = sbj
    msg['From'] = 'root@%s' % HOSTNAME
    # BUGFIX: list every recipient in the header; the original put only
    # RECIPIENT[0] there although the message was delivered to all of them.
    msg['To'] = ', '.join(RECIPIENT)
    msg.attach(MIMEText(mesg, 'plain'))
    smtpconnect = smtplib.SMTP(SMTP_SRV, SMTP_PORT)
    try:
        smtpconnect.starttls()
        smtpconnect.login(SENDER, SMTP_PASS)
        smtpconnect.sendmail(SENDER, RECIPIENT, msg.as_string())
    finally:
        # Close the connection even if login/sendmail raises.
        smtpconnect.quit()
def now_time():
    """Return the current local time formatted as 'YYYY.MM.DD_HH:MM:SS'."""
    return datetime.datetime.now().strftime('%Y.%m.%d_%H:%M:%S')
def log_write(msg, log_file=LOG):
    """Append a timestamped *msg* to *log_file* (default: the script log).

    The caller is responsible for a trailing newline in *msg*.
    """
    # 'with' closes the descriptor even if write() fails, replacing the
    # original manual open/try/finally/close sequence.
    with open(log_file, 'a') as log:
        log.write(now_time()+' '+msg)
def check_dir(dr):
    """Return True when *dr* is an existing directory, False otherwise."""
    return os.path.isdir(dr)
def exec_command(cmd):
    """Run *cmd* in a shell and return a (output_lines, return_code) tuple.

    output_lines -- list of raw output lines (stderr merged into stdout)
    return_code  -- the process exit status
    """
    # NOTE(review): shell=True hands *cmd* to /bin/sh. All commands in this
    # script are built from module constants, but never pass untrusted
    # input through this helper.
    proc = subprocess.Popen(cmd, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    # communicate() drains the pipe (no deadlock on large output), waits
    # for exit and sets returncode -- replaces the fragile
    # readlines()/communicate()/poll() triple of the original.
    out = proc.communicate()[0].splitlines(True)
    return out, proc.returncode
def get_file_list(regexp, path, db=''):
    """Return a sorted list of entries of *path* whose names contain
    *regexp* (prefixed with *db* when *db* is non-empty).

    Despite its name, *regexp* is matched as a plain substring.
    """
    pattern = '%s%s' % (db, regexp) if db else regexp
    matches = [entry for entry in os.listdir(path) if pattern in entry]
    matches.sort()
    return matches
def remove_old_files(del_list, keep_files):
    """Delete the oldest entries of *del_list* (names inside LOG_DIR)
    until at most *keep_files* of them remain.

    *del_list* must be sorted oldest-first; it is mutated in place.
    """
    while len(del_list) > keep_files:
        # BUGFIX: pop first, then act on the popped name. The original
        # popped the item and then logged del_list[0] -- i.e. the NEXT
        # file's name -- and raised IndexError once the list was empty.
        victim = del_list.pop(0)
        path = LOG_DIR + victim
        if os.path.isfile(path):
            os.remove(path)
        elif os.path.isdir(path):
            shutil.rmtree(path)
        log_write('removing %s\n' % victim)
def get_size(start_path):
    """Return the total size in bytes of all files under *start_path*."""
    total = 0
    for root, _dirs, files in os.walk(start_path):
        total += sum(os.path.getsize(os.path.join(root, name))
                     for name in files)
    return total
def main():
    """Validate the latest dump by restoring it and report the result.

    Every failure path logs a message, sends a notification and exits
    with status 1; a fully successful run exits with status 0.
    """
    # Mail subject template:
    mail_sbj = '%s: pgdump_check.py status' % HOSTNAME

    # Mark start of job in the logfile:
    log_write('==Start pgdump_check.py for database %s==\n' % RECOVER_DB)

    # Safety net: this script drops RECOVER_DB at the end, so refuse to
    # run anywhere but the dedicated restore host:
    if RESTORE_HOSTNAME != HOSTNAME:
        msg = 'You are trying to restore pgdump on %s. '\
              'Are you sure that it\'s the right server? exit\n' % HOSTNAME
        log_write(msg)
        send_mail(mail_sbj, msg)
        sys.exit(1)

    # Check that postgres server processes are running:
    pg_process = 'ps aux | grep postgres | '\
                 'grep -v "bash\|grep\|su - postgres\|ps aux\|python"'
    pg_process_status = exec_command(pg_process)
    process_retcode = pg_process_status[1]
    # BUGFIX: the original tested `not pg_process` -- the command STRING,
    # which is always truthy; test the command output instead.
    if not pg_process_status[0] or process_retcode != 0:
        msg = '%s: postgres are not running or retcode '\
              'of check not equal "0", see %s. exit\n' % (HOSTNAME, LOG)
        for line in pg_process_status[0]:
            log_write(line)
        send_mail(mail_sbj, msg)
        sys.exit(1)

    # Check dump directory:
    if not check_dir(DUMP_DIR):
        msg = '%s: %s does not exist. exit\n' % (HOSTNAME, DUMP_DIR)
        log_write(msg)
        send_mail(mail_sbj, msg)
        sys.exit(1)

    # Check RECOVER_DB; if it already exists we must stop -- we would
    # otherwise restore over (and later drop) a live database:
    db_check = 'psql -t -c "\l %s"' % RECOVER_DB
    db_check_status = exec_command(db_check)
    db_check_retcode = db_check_status[1]
    if db_check_retcode == 0 and db_check_status[0][0] != '\n':
        msg = '%s: database %s exists or this check is not possible! '\
              'stop recovery, exit\n' % (HOSTNAME, RECOVER_DB)
        log_write(msg)
        send_mail(mail_sbj, msg)
        sys.exit(1)
    log_write('database %s does not exist, begin recovery\n' % RECOVER_DB)

    # Get the last dump path (dir names embed a timestamp, so the sorted
    # list's last element is the most recent dump):
    dump_list = get_file_list(DUMP_SUFFIX, DUMP_DIR)
    if not dump_list:
        msg = '%s: dump with suffix "%s" does not exist. exit\n' % (
            HOSTNAME, DUMP_SUFFIX)
        log_write(msg)
        send_mail(mail_sbj, msg)
        sys.exit(1)
    last_dump = DUMP_DIR+dump_list[-1]

    # Run pg_restore:
    start_time = datetime.datetime.now()
    dump_size = get_size(last_dump) / 1024 / 1024
    log_write('dump_size is %s MB\n' % dump_size)
    restore_command = 'pg_restore -F d %s -j %s -C -d postgres' % (
        last_dump, RECOVER_JOBS)
    restore_status = exec_command(restore_command)
    restore_retcode = restore_status[1]
    # Any output from pg_restore is treated as a failure sign too:
    if restore_retcode != 0 or restore_status[0]:
        restore_msg = ''.join(restore_status[0])
        log_write(restore_msg+'\n')
        msg = '%s: restore %s failed' % (HOSTNAME, RECOVER_DB)
        log_write(msg+'\n')
        send_mail(mail_sbj, msg+', see %s for more info\n' % LOG)
        sys.exit(1)

    # Check the postgres log for errors:
    log_list = get_file_list(PGLOG_SUFFIX, PGLOG_DIR)
    if not log_list:
        msg = '%s: log in %s or with "%s" suffix does not exist\n' % (
            HOSTNAME, PGLOG_DIR, PGLOG_SUFFIX)
        log_write(msg)
        send_mail(mail_sbj, msg)
    else:
        pg_last_log = PGLOG_DIR+'/'+log_list[-1]
        grep_error = 'grep "FATAL\|ERROR" %s | '\
                     'grep -v " autovacuum "' % pg_last_log
        grep_error_status = exec_command(grep_error)
        grep_retcode = grep_error_status[1]
        # grep exits 0 exactly when it found matching (error) lines:
        if grep_retcode == 0:
            grep_msg = ''.join(grep_error_status[0])
            log_write(grep_msg+'\n')
            msg = 'in %s:' % pg_last_log
            log_write(msg+'\n')
            send_mail(mail_sbj, grep_msg+':\n'+msg)

    end_time = datetime.datetime.now()
    exec_time = end_time - start_time

    # Make and send the stat report:
    du = exec_command('du -s %s' % PGDATA)[0]
    cluster_size = int(du[0].split()[0]) / 1024 / 1024
    msg = '%s: restore %s %s done:\nexec_time=%s\ncluster_size=%sGB\n' % (
        HOSTNAME, RECOVER_DB, last_dump, exec_time, cluster_size)
    log_write(msg)
    send_mail(mail_sbj, msg)

    # Check is finished, drop RECOVER_DB:
    drop_command = 'psql -t -c "drop database %s;"' % RECOVER_DB
    drop_status = exec_command(drop_command)
    drop_retcode = drop_status[1]
    if drop_retcode != 0:
        drop_msg = ''.join(drop_status[0])
        # BUGFIX: log the psql error output; the original logged the stale
        # report message and never used drop_msg.
        log_write(drop_msg+'\n')
        msg = 'drop database %s failed\n' % (RECOVER_DB)
        log_write(msg)
        send_mail(mail_sbj, msg+', see %s for more info\n' % LOG)
        sys.exit(1)

    # Remove old logs of this script:
    remove_old_files(get_file_list(LOG_PREF, LOG_DIR), KEEP_FILES)
    log_write('==All jobs are executed==\n')
    sys.exit(0)
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment