Created
July 31, 2018 07:10
-
-
Save Andersson007/f781b7c097645d051ffa6413606d0669 to your computer and use it in GitHub Desktop.
The script checks postgres dump and sends mail notification about status and statistic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# pgdump_check.py - checks postgres dump and | |
# sends mail notification about status and statistic | |
# | |
# Author: Andrey Klychkov aaklychkov@mail.ru | |
# Version: 1.0 | |
# Date: 2018-07-31 | |
# Licence: Copyleft free software | |
# Requirements: | |
# * Python 2.6; | |
# * dumps in the Directory format, sorted by timestamp in names, | |
# for example, dump dir names may have the "YYYY-MM-DD_testdb" form | |
# | |
# This script: | |
# 0) creates its own log file | |
# 1) checks hostname and postgres process, it must be running | |
# 2) checks RECOVER_DB in the cluster, if it exists, exit | |
# 3) finds the last dump | |
# 4) runs pg_restore | |
# 5) checks the postgres log for errors | |
# 6) sends stat report about execution time and cluster size | |
# after restoring | |
# 7) drop the recovered database | |
# 8) remove own log files | |
# | |
# Notification output example: | |
# ---------------------------- | |
# mysrv01: restore testdb /mnt/backups/edi/2018-07-28_testdb done: | |
# exec_time=1 day, 11:11:39.747817 | |
# cluster_size=743GB | |
import datetime | |
import os | |
import shutil | |
import smtplib | |
import socket | |
import subprocess | |
import sys | |
import time | |
from email.mime.multipart import MIMEMultipart | |
from email.mime.text import MIMEText | |
VERSION = '1.0' | |
HOSTNAME = socket.gethostname() | |
NOW = datetime.datetime.now() | |
F_TIME = NOW.strftime('%Y%m%d') | |
####################### | |
# CONFIGURATION BLOCK # | |
####################### | |
# Main params: | |
PGDATA = '/var/lib/pgsql/10/data' # path to PGDATA | |
RESTORE_HOSTNAME = 'mysrv1' # this hostname, you want to restore pgdump on it | |
# must be according $HOSTNAME | |
# to prevent of disaster - removing work database | |
DUMP_DIR = '/mnt/backups/edi/' # path to dir that contents dumps | |
DUMP_SUFFIX = '_testdb' # dump suffix | |
RECOVER_DB = 'testdb' # db name to recover | |
RECOVER_JOBS = 4 # number of pg_restore jobs | |
PGLOG_DIR = PGDATA+'/pg_log' # path to postgres log directory | |
PGLOG_SUFFIX = 'csv' # suffix of postgres logs | |
# Logging params of this script: | |
LOG_DIR = '/tmp/' | |
LOG_PREF = 'pgdump_check.log_' | |
LOG = LOG_DIR+LOG_PREF+F_TIME | |
KEEP_FILES = 7 # Number of keeping logs | |
# Mailing params: | |
SEND_MAIL = True | |
SENDER = 'report.mycompany@gmail.com' | |
RECIPIENT = ['receiver1@mydomain.com'] | |
SMTP_SRV = 'smtp.gmail.com' | |
SMTP_PORT = 587 | |
SMTP_PASS = 'PassForSender' | |
def send_mail(sbj, mesg): | |
if SEND_MAIL is True: | |
msg = MIMEMultipart() | |
msg['Subject'] = (sbj) | |
msg['From'] = 'root@%s' % (HOSTNAME) | |
msg['To'] = RECIPIENT[0] | |
body = mesg | |
msg.attach(MIMEText(body, 'plain')) | |
smtpconnect = smtplib.SMTP(SMTP_SRV, SMTP_PORT) | |
smtpconnect.starttls() | |
smtpconnect.login(SENDER, SMTP_PASS) | |
smtpconnect.sendmail(SENDER, RECIPIENT, msg.as_string()) | |
smtpconnect.quit() | |
else: | |
pass | |
def now_time(): | |
now = datetime.datetime.now() | |
f_time = now.strftime('%Y.%m.%d_%H:%M:%S') | |
return f_time | |
def log_write(msg, log_file=LOG): | |
log = open(log_file, 'a') | |
try: | |
log.write(now_time()+' '+msg) | |
finally: | |
log.close() | |
def check_dir(dr): | |
if os.path.isdir(dr) is False: | |
return False | |
else: | |
return True | |
def exec_command(cmd): | |
ret = subprocess.Popen(cmd, shell=True, | |
stdout=subprocess.PIPE, | |
stderr=subprocess.STDOUT) | |
out = ret.stdout.readlines() | |
ret.communicate() | |
rcode = ret.poll() | |
return out, rcode | |
def get_file_list(regexp, path, db=''): | |
'''Return a sorted list of filesystem objects | |
''' | |
if db: | |
regexp = '%s%s' % (db, regexp) | |
file_list = [s for s in os.listdir(path) if regexp in s] | |
return sorted(file_list) | |
def remove_old_files(del_list, keep_files): | |
'''Remove old files if they are more than keep_files | |
''' | |
while len(del_list) > keep_files: | |
if os.path.isfile(LOG_DIR+del_list[0]): | |
os.remove(LOG_DIR+del_list[0]) | |
if os.path.isdir(LOG_DIR+del_list[0]): | |
shutil.rmtree(LOG_DIR+del_list[0]) | |
del_list.pop(0) | |
log_write('removing %s\n' % del_list[0]) | |
def get_size(start_path): | |
total_size = 0 | |
for dirpath, dirnames, filenames in os.walk(start_path): | |
for f in filenames: | |
fp = os.path.join(dirpath, f) | |
total_size += os.path.getsize(fp) | |
return total_size | |
def main(): | |
# Mail subject template: | |
mail_sbj = '%s: pgdump_check.py status' % HOSTNAME | |
# Mark start of job in a logfile: | |
log_write('==Start pgdump_check.py for database %s==\n' % RECOVER_DB) | |
# To prevent to drop database on a production cluster, | |
# RESTORE_HOSTNAME and HOSTNAME must be the same host: | |
if RESTORE_HOSTNAME != HOSTNAME: | |
msg = 'You are trying to restore pgdump on %s. '\ | |
'Are you sure that it\'s the right server? exit\n' % HOSTNAME | |
log_write(msg) | |
send_mail(mail_sbj, msg) | |
sys.exit(1) | |
# Check postgresql processes: | |
pg_process = 'ps aux | grep postgres | '\ | |
'grep -v "bash\|grep\|su - postgres\|ps aux\|python"' | |
pg_process_status = exec_command(pg_process) | |
process_retcode = pg_process_status[1] | |
if not pg_process or process_retcode != 0: | |
msg = '%s: postgres are not running or retcode '\ | |
'of check not equal "0", see %s. exit\n' % (HOSTNAME, LOG) | |
for i in pg_process_status[0]: | |
log_write(i) | |
send_mail(mail_sbj, msg) | |
sys.exit(1) | |
# } end check | |
# Check dump directory: | |
if not check_dir(DUMP_DIR): | |
msg = '%s: %s does not exist. exit\n' % (HOSTNAME, DUMP_DIR) | |
log_write(msg) | |
send_mail(mail_sbj, msg) | |
sys.exit(1) | |
# Check RECOVER_DB. If it exists, exit: | |
db_check = 'psql -t -c "\l %s"' % RECOVER_DB | |
db_check_status = exec_command(db_check) | |
db_check_retcode = db_check_status[1] | |
if db_check_retcode == 0 and db_check_status[0][0] != '\n': | |
msg = '%s: database %s exists or this check is not possible! '\ | |
'stop recovery, exit\n' % (HOSTNAME, RECOVER_DB) | |
log_write(msg) | |
send_mail(mail_sbj, msg) | |
sys.exit(1) | |
log_write('database %s does not exist, begin recovery\n' % RECOVER_DB) | |
# Get the last dump path: | |
dump_list = get_file_list(DUMP_SUFFIX, DUMP_DIR) | |
if not dump_list: | |
msg = '%s: dump with suffix "%s" does not exist. exit\n' % ( | |
HOSTNAME, DUMP_SUFFIX) | |
log_write(msg) | |
send_mail(mail_sbj, msg) | |
sys.exit(1) | |
else: | |
last_dump = DUMP_DIR+dump_list[-1] | |
# Run pg_restore: | |
start_time = datetime.datetime.now() | |
dump_size = get_size(last_dump) / 1024 / 1024 | |
log_write('dump_size is %s MB\n' % dump_size) | |
restore_command = 'pg_restore -F d %s -j %s -C -d postgres' % ( | |
last_dump, RECOVER_JOBS) | |
restore_status = exec_command(restore_command) | |
restore_retcode = restore_status[1] | |
if restore_retcode != 0 or restore_status[0]: | |
restore_msg = ''.join(restore_status[0]) | |
log_write(restore_msg+'\n') | |
msg = '%s: restore %s failed' % (HOSTNAME, RECOVER_DB) | |
log_write(msg+'\n') | |
send_mail(mail_sbj, msg+', see %s for more info\n' % LOG) | |
sys.exit(1) | |
# Check postgres log for errors: | |
log_list = get_file_list(PGLOG_SUFFIX, PGLOG_DIR) | |
if not log_list: | |
msg = '%s: log in %s or with "%s" suffix does not exist\n' % ( | |
HOSTNAME, PGLOG_DIR, PGLOG_SUFFIX) | |
log_write(msg) | |
send_mail(mail_sbj, msg) | |
else: | |
pg_last_log = PGLOG_DIR+'/'+get_file_list(PGLOG_SUFFIX, PGLOG_DIR)[-1] | |
grep_error = 'grep "FATAL\|ERROR" %s | '\ | |
'grep -v " autovacuum "' % pg_last_log | |
grep_error_status = exec_command(grep_error) | |
grep_retcode = grep_error_status[1] | |
if grep_retcode == 0: | |
grep_msg = ''.join(grep_error_status[0]) | |
log_write(grep_msg+'\n') | |
msg = 'in %s:' % pg_last_log | |
log_write(msg+'\n') | |
send_mail(mail_sbj, grep_msg+':\n'+msg) | |
end_time = datetime.datetime.now() | |
exec_time = end_time - start_time | |
# Make and send the stat report: | |
du = exec_command('du -s %s' % PGDATA)[0] | |
cluster_size = int(du[0].split()[0]) / 1024 / 1024 | |
msg = '%s: restore %s %s done:\nexec_time=%s\ncluster_size=%sGB\n' % ( | |
HOSTNAME, RECOVER_DB, last_dump, exec_time, cluster_size) | |
log_write(msg) | |
send_mail(mail_sbj, msg) | |
# Check is finished, drop RECOVER_DB: | |
drop_command = 'psql -t -c "drop database %s;"' % RECOVER_DB | |
drop_status = exec_command(drop_command) | |
drop_retcode = drop_status[1] | |
if drop_retcode != 0: | |
drop_msg = ''.join(drop_status[0]) | |
log_write(msg+'\n') | |
msg = 'drop database %s faild\n' % (RECOVER_DB) | |
log_write(msg) | |
send_mail(mail_sbj, msg+', see %s for more info\n' % LOG) | |
sys.exit(1) | |
# Remove old logs: | |
remove_old_files(get_file_list(LOG_PREF, LOG_DIR), KEEP_FILES) | |
log_write('==All jobs are executed==\n') | |
sys.exit(0) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment