Skip to content

Instantly share code, notes, and snippets.

Created November 4, 2011 19:15
Show Gist options
  • Save itdaniher/1340224 to your computer and use it in GitHub Desktop.
Save itdaniher/1340224 to your computer and use it in GitHub Desktop.
rsync backup script using hardlinks, written by someone in python
#!/usr/bin/env python
# Script for automatic backup of files to a fileserver using rsync.
# Directories to backup and details about the remote fileserver and
# exclude patterns must be entered in the backup script itself.
# See 'User Configurable Parameters' below.
# Backup consists of three functions controlled by commandline arguments:
# /etc/backup sync
# Performs the incremental rsync of select directories
# Can be run frequently e.g. every half hour
# /etc/backup snapshot
# Rotates backups to retain snapshots back in time
# Should be run every 2 or 4 hours
# /etc/backup weedout
# Deletes superfluous snapshots.
# Should be run every day after midnight
# Without arguments backup defaults to 'sync'; pass 'all' to do all of the above.
# The script can be executed either manually or
# as a cron job. E.g to run backup every twenty minutes during the day and
# every two hours at night,
#3,23,43 8-23 * * * root /etc/backup sync > /var/log/backup_sync.lastlog
#3 0-7/2 * * * root /etc/backup sync > /var/log/backup_sync.lastlog
# Make snapshot of backup tree every two hours 6 min past the hour
#6 */2 * * * root /etc/backup snapshot > /var/log/backup_snapshot.lastlog
# Weed-out superfluous snapshots every night 1:09
#9 1 * * * root /etc/backup weedout > /var/log/backup_weedout.lastlog
#
# The cronjob versions require that ssh and rsync can access the
# fileserver without prompting for a password.
# This can be accomplished by using ssh-agent or ssh-add with a
# blank passphrase
# Ole Nielsen ANU, May 2002
# ---------------------------------------------------------------------------
# User Configurable Parameters
# ---------------------------------------------------------------------------

# DESTINATION - account and directory on the remote fileserver.
# The script connects as username@fileserver and stores everything
# below remotedir.
remotedir = '/home/it/backup/TRAITOR'
username = 'it'
fileserver = 'it-nepal'

# SOURCE - Specify directories to backup (without trailing slash)
backup_dirs = ['/Users']

# General exclude patterns, each passed to rsync as --exclude.
# See man rsync for details on usage.
# Further site-specific patterns can simply be added to this list.
exclude_list = [
    "*~", ".*~", "#*#", ".#*", "*.o",
    "Dropbox", "Downloads", "Music", ".Trash", ".dropbox",
    "Library", "Public", "Photos", "Movies",
]

# Include patterns, each passed to rsync as --include.
include_list = []

# Behaviour switches
use_gnu = 0   # Set to 1 only if using GNU on remote system
dryrun = 0    # Don't actually do it - for testing purposes
verbose = 2   # Verbose output. 0: Nothing, 1: Some, 2: Everything

# Delays (in each timeslot: year, month, week, day, hour) before
# old backups get weeded out. For example [0,0,0,0,0].
delay = [15, 5, 2, 2, 1]

# Base file name for temporary storage on local host
tmpfile_basename = '/tmp/backup_files'
# The program
# Get commandline arguments
import sys

# Exactly one argument selects the action; anything else falls back to the
# quick default.
if len(sys.argv) == 2:
    arg = sys.argv[1]
else:
    #arg = 'all'
    arg = 'sync'  # Default to sync (quick)

# Aliases: alternative spellings mapped onto the canonical action names
_ARG_ALIASES = {
    'rsync': 'sync',
    'weed': 'weedout',
    'snap': 'snapshot',
    'rotate': 'snapshot',
    'stat': 'stats',
}
arg = _ARG_ALIASES.get(arg, arg)

# Reject unknown actions explicitly. An assert would be stripped under
# 'python -O' and let an invalid action through silently.
errmsg = 'Invalid command line argument %s' % arg
if arg not in ['all', 'sync', 'snapshot', 'weedout', 'stats']:
    sys.exit(errmsg)
def make_timeslot(time_tuple):
    """Return [year, month, week, day, hour] for a time tuple.

    time_tuple is indexed like the result of time.localtime() or
    time.strptime(): [0] year, [1] month, [2] day of month, [3] hour and
    [7] day of the year. The week entry is the day of the year floor-divided
    by 7.

    Must be organised from slow to fast measure.
    """
    return [
        time_tuple[0],
        time_tuple[1],
        time_tuple[7] // 7,  # explicit floor division, same on Python 2 and 3
        time_tuple[2],
        time_tuple[3],
    ]
# Human-readable label for each position of the list returned by
# make_timeslot. Has to be consistent with make_timeslot for reporting
# purposes!
timeslot_names = [
    'yearly',
    'monthly',
    'weekly',
    'daily',
    'hourly',
]
# Backup machinery
import sys, time, os, string
# Reference point for all elapsed-time reports and for the time stamp
t_start = time.time()

# Form destination dir based on current time stamp
time_tuple = time.localtime(t_start)
time_stamp = time.strftime('%Y-%m-%d_%H:%M:%S', time_tuple)
destination = remotedir + '/' + 'Backup.' + time_stamp
most_recent_hardlink = remotedir + '/MOST_RECENT'
tmpfile = tmpfile_basename + '.' + time_stamp

# Command (run on the fileserver by the snapshot step) that populates the
# new time-stamped directory from MOST_RECENT. Exactly one variant is chosen:
# 'cp -al' when use_gnu is set, otherwise the find/cpio pipeline.
if use_gnu:
    copycmd = 'cp -al %s %s' % (most_recent_hardlink, destination)
else:
    copycmd = 'mkdir %s; cd %s && find . -print | cpio -dpl %s' \
              % (destination, most_recent_hardlink, destination)
# Verify existence of remote account by running a trivial command over ssh
remote_account = username + '@' + fileserver
exitcode = os.system('ssh %s "hostname" >/dev/null' % remote_account)
if exitcode != 0:
    # String exceptions are not valid on Python 2.6+ or 3; raise a real one
    raise RuntimeError("Remote account %s could not be accessed"
                       % remote_account)

# Verify existence of remote backup directory. Create if necessary
exitcode = os.system('ssh %s "cd %s"' % (remote_account, remotedir))
if exitcode != 0:
    exitcode = os.system('ssh %s "mkdir %s"' % (remote_account, remotedir))
    if exitcode != 0:
        raise RuntimeError('Could not create remote directory %s' % remotedir)
if arg in ['sync', 'snapshot', 'all']:
    # Check that previous backup is finished: a readable lock file means
    # another run has not yet removed it.
    lockfile = '/var/lock/backup.lock'
    try:
        fid = open(lockfile, 'r')
    except IOError:
        # No lock file - no backup in progress
        backup_in_progress = 0
    else:
        backup_time = fid.readline().strip()
        fid.close()
        backup_in_progress = 1

    if backup_in_progress:
        s = "%s: Rotating backups still in progress since %s.\n" \
            % (time_stamp, backup_time)
        s += "Please wait till previous backup has completed.\n"
        s += "If this is wrong, please delete %s and try again." % lockfile
        # Aborting at this point was deliberately disabled ('# raise s'),
        # so only report the situation and carry on.
        if verbose > 0:
            print(s)

    # Make lock
    if verbose > 0:
        print('Making lock file %s' % lockfile)
    with open(lockfile, 'w') as fid:
        fid.write(arg + ':' + time_stamp + '\n')

# Write to log file. Failure to log must not stop the backup, matching the
# handling of the closing log entry at the end of the script.
try:
    with open('/var/log/backup.log', 'a') as fid:
        fid.write('Started backup %s at %s\n' % (arg, time.asctime()))
except IOError:
    print('WARNING: Could not open log file /var/log/backup.log')
if arg in ['sync', 'snapshot', 'all']:
    # Form include/exclude string and options: one quoted --exclude /
    # --include option per pattern, each followed by a space.
    exclude_string = ''.join(['--exclude "%s" ' % pattern
                              for pattern in exclude_list])
    include_string = ''.join(['--include "%s" ' % pattern
                              for pattern in include_list])

    rsync_long_options = '%s %s --delete --delete-excluded --delete-after' \
                         % (include_string, exclude_string)

    # Only the highest verbosity level asks rsync itself to be verbose
    if verbose == 2:
        rsync_short_options = '-azv'
    else:
        rsync_short_options = '-az'

    remote_account = username + '@' + fileserver

    # Start backing up: mirror every source directory into MOST_RECENT
    for src_dir in backup_dirs:
        cmd = 'rsync %s %s %s %s@%s:%s' \
              % (rsync_short_options, rsync_long_options, src_dir,
                 username, fileserver, most_recent_hardlink)

        if verbose > 0:
            print(cmd)

        if not dryrun:
            exitcode = os.system(cmd)
            # A failure for one directory is reported but does not stop
            # the remaining directories from being backed up.
            if exitcode != 0 and verbose > 0:
                print('\nWARNING (backup): Problems copying directory %s to %s'
                      % (src_dir, fileserver))
                print(' This can for example happen if user')
                print(' is not allowed to read all of %s' % (src_dir))

    # Update time stamp on newly created backup (skipped on a dry run so
    # that nothing on the fileserver is modified)
    cmd = 'ssh %s "touch %s"' % (remote_account, most_recent_hardlink)
    if not dryrun:
        exitcode = os.system(cmd)

    if verbose > 0:
        print("rsync completed in %d seconds" % (time.time() - t_start))
if arg in ['snapshot', 'all']:
    t_snap = time.time()

    # Make hard links from most recent backup to name with time stamp
    # by running the prepared copy command on the fileserver.
    cmd = 'ssh %s "%s"' % (remote_account, copycmd)
    if verbose > 0:
        print(cmd)

    # Honour dryrun like the sync and weedout steps do
    if not dryrun:
        exitcode = os.system(cmd)
        if exitcode != 0 and verbose > 0:
            print('\nWARNING (backup): Problems creating snapshot %s'
                  % destination)

    if verbose > 0:
        print("hardlink rotation completed in %d seconds"
              % (time.time() - t_snap))
if arg in ['weedout', 'all']:
    # House keeping - weed out among older backups
    t_weed = time.time()

    current_timeslot = make_timeslot(time_tuple)

    # Create a dictionary for each timeslot (year, month, week, day, hour).
    # timedict[i] maps a slot value to {filename: seconds since the epoch}.
    timedict = [{} for _ in current_timeslot]
    keeplist = []

    # Get all backup directories and extract their time stamps
    cmd = 'ssh %s "%s" > %s' % (remote_account, 'ls %s' % remotedir, tmpfile)
    exitcode = os.system(cmd)

    with open(tmpfile, 'r') as fid:
        listing = fid.readlines()
    try:
        os.remove(tmpfile)  # the listing is in memory; do not litter /tmp
    except OSError:
        pass

    for line in listing:
        filename = line.strip()
        if not filename:
            continue

        field_list = filename.split('.')  # Extract extension (time stamp)
        processed = 0

        if len(field_list) > 1:
            stamp = field_list[1]
        else:
            stamp = None

        if stamp is not None:
            try:
                # Use a separate name so the script-wide time_tuple (the
                # current time) is not overwritten.
                stamp_tuple = time.strptime(stamp, '%Y-%m-%d_%H:%M:%S')
            except ValueError:
                print('Warning:' + stamp + ' could not be parsed')
            else:
                # Organise files in the various time slots
                # ordered as year, month, week, day, hour
                # as specified in make_timeslot
                # NOTE(review): each field is compared on its own, so the
                # month/week/day/hour tests do not account for wrap-around
                # into the previous year/month/day - confirm this is intended.
                timeslot = make_timeslot(stamp_tuple)
                for i, slot in enumerate(timeslot):
                    if slot < current_timeslot[i] - delay[i]:
                        # File is older than current time slot - delay,
                        # put it into appropriate slot.
                        if slot not in timedict[i]:
                            timedict[i][slot] = {}
                        timedict[i][slot][filename] = time.mktime(stamp_tuple)
                        processed = 1
                        break  # Do not enter file into more than one slot

        # Anything not expired (or without a parseable stamp) is kept
        if not processed:
            keeplist.append(filename)

    # Keep only the newest from each list
    delete_files = []
    for i, ttt in enumerate(timedict):
        for flist in ttt.values():
            if len(flist) > 0:
                # Sort by time so that the last entry is the newest
                A = sorted(zip(flist.values(), flist.keys()))
                keepfile = A[-1][1]
                tobe_deleted = [entry[1] for entry in A[:-1]]

                print('Expired time slot (%s)' % timeslot_names[i])
                if len(tobe_deleted) > 0:
                    print(' Delete:  %s' % (tobe_deleted,))
                print(' Keep:  %s' % keepfile)

                delete_files += tobe_deleted  # Accumulate
                # The survivor of each slot belongs in the kept report too
                keeplist.append(keepfile)

    print('To be deleted:')
    print(delete_files)
    print('To be kept:')
    print(keeplist)

    # Delete superfluous files
    if len(delete_files) > 0:
        delete_string = ' '.join(delete_files)
        # '&&' rather than ';': never run rm -rf if the cd failed, as the
        # relative names would then be resolved in the wrong directory.
        cmd = 'ssh %s "cd %s && /bin/rm -rf %s"' \
              % (remote_account, remotedir, delete_string)
        if verbose > 0:
            print(cmd)
        if not dryrun:
            os.system(cmd)

    if verbose > 0:
        print("superfluous files deleted in %d seconds"
              % (time.time() - t_weed))
if arg in ['stats', 'all']:
    # Get some stats on disk usage
    t_stats = time.time()

    # Number of backups: one line of 'ls' output per entry in remotedir
    cmd = 'ssh %s "%s" > %s' % (remote_account, 'ls %s' % remotedir, tmpfile)
    exitcode = os.system(cmd)

    with open(tmpfile, 'r') as fid:
        files = fid.readlines()

    print('You currently have %d backups' % (len(files)))

    # Disk usage of the latest backup (overwrites the temporary file) ...
    cmd = 'ssh %s "%s" > %s' \
          % (remote_account, 'du -sh %s/MOST_RECENT' % remotedir, tmpfile)
    if verbose:
        print(cmd)
    exitcode = os.system(cmd)

    # ... followed by disk usage of the whole backup tree (appended)
    cmd = 'ssh %s "%s" >> %s' \
          % (remote_account, 'du -sh %s' % remotedir, tmpfile)
    if verbose:
        print(cmd)
    exitcode = os.system(cmd)

    with open(tmpfile, 'r') as fid:
        lines = fid.readlines()
    try:
        os.remove(tmpfile)  # results are in memory; do not litter /tmp
    except OSError:
        pass

    # Report only when both du commands produced exactly one line each
    if len(lines) == 2:
        size1 = lines[0].strip().split()[0]
        size2 = lines[1].strip().split()[0]

        print('Size of latest backup: %s ' % size1)
        print('Size of all backups: %s ' % size2)

    if verbose > 0:
        print("Stats obtained in %d seconds" % (time.time() - t_stats))
# Record completion in the log file. Logging is best effort: a failure to
# open or write the log is reported but does not fail the backup.
try:
    with open('/var/log/backup.log', 'a') as fid:
        fid.write('Finished backup %s in %d seconds at %s\n'
                  % (arg, time.time() - t_start, time.asctime()))
except IOError:
    print('WARNING: Could not open log file /var/log/backup.log')
# Only the actions that created the lock file (and defined 'lockfile')
# may remove it.
if arg in ['sync', 'snapshot', 'all']:
    # Remove lockfile
    try:
        os.remove(lockfile)
    except OSError:
        print('WARNING: Could not remove lock file %s' % lockfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment