ktraff/Amazon S3 Backup.py

## Amazon S3 Backup.py
"""
A script that backs up the database and sends the compressed output to an S3 bucket
"""
import os
import sys
import glob
import subprocess
import contextlib
import functools
import multiprocessing
from multiprocessing.pool import IMapIterator
import rfc822
import time
import datetime
from dateutil.relativedelta import relativedelta
import boto
import uuid
import logging


# Define variables here.
NOW = datetime.datetime.now()  # Used for filenames etc.
USERFILES_PATHS = ["/my/list/of/folders/and/files/to/copy"]
RESULTS_PATH = "/tmp/" + NOW.strftime("%Y_%m_%d") + "-backup.tar.gz"
BACKUP_PATH = "/raid"
TWO_WEEKS_AGO=datetime.datetime.now() - datetime.timedelta(days=14)

# The number of days to keep local backups in external storage.
ARCHIVE_DAYS = 14


AWS_ACCESS_KEY_ID='my-access-key'
AWS_SECRET_ACCESS_KEY='my-secret-access-key'
AWS_BUCKET_NAME = "my-s3-bucket"
# The name of the file to be stored in the Amazon S3 bucket, called a "key".
AWS_KEYNAME=NOW.strftime("%Y_%m_%d-backup") + ".tar.gz"
# The number of CPU cores with which to split the pieces of the tarball.
CORES=1
# If True, stores the file in a "Reduced Redundancy" class in S3.
USE_REDUCED_REDUNDANCY=False
# The size in MB of each piece of the file to upload.
MB_SIZE=100


@contextlib.contextmanager
def multimap(cores=None):
    if cores is None:
        cores = max(multiprocessing.cpu_count() - 1, 1)
    def wrapper(func):
        def wrap(self, timeout=None):
            return func(self, timeout=timeout if timeout is not None else 1e100)
        return wrap
    IMapIterator.next = wrapper(IMapIterator.next)
    pool = multiprocessing.Pool(cores)
    yield pool.imap
    pool.terminate()


def map_wrap(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        return apply(f, *args, **kwargs)
    return wrapper


def mp_from_ids(mp_id, mp_keyname, mp_bucketname):
    conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.lookup(mp_bucketname)
    mp = boto.s3.multipart.MultiPartUpload(bucket)
    mp.key_name = mp_keyname
    mp.id = mp_id
    return mp


def split_file(in_file, mb_size, split_num=5):
    prefix = os.path.join(os.path.dirname(in_file),
                          "%sS3PART" % (os.path.basename(AWS_KEYNAME)))
    split_size = int(min(mb_size / (split_num * 2.0), 250))
    if not os.path.exists("%saa" % prefix):
        cl = ["split", "-d", "--suffix-length=5", "-b%sm" % split_size, in_file, prefix]
        subprocess.check_call(cl)
    return sorted(glob.glob("%s*" % prefix))


@map_wrap
def transfer_part(mp_id, mp_keyname, mp_bucketname, i, part):
    mp = mp_from_ids(mp_id, mp_keyname, mp_bucketname)
    logger.info(" Transferring " +  str(i) + str(part))
    with open(part) as t_handle:
        mp.upload_part_from_file(t_handle, i+1)
    os.remove(part)

# Compress everything together.
result = subprocess.call(['tar', 'czf', RESULTS_PATH] + USERFILES_PATHS)

if not result:
    # The backup completed successfully.

    # Instantiate a new client for Amazon Simple Storage Service (S3). With no
    # parameters or configuration, the AWS SDK for Python (Boto) will look for
    # access keys in these environment variables:
    # For more information about this interface to Amazon S3, see:
    # http://boto.readthedocs.org/en/latest/s3_tut.html
    # Configure a host with the kwarg host="s3-us-west-2.amazonaws.com"
    s3 = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

    # Everything uploaded to Amazon S3 must belong to a bucket. These buckets are
    # in the global namespace, and must have a unique name.
    #
    # For more information about bucket name restrictions, see:
    # http://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html
    logger.info("Opening bucket with name: " + AWS_BUCKET_NAME)
    bucket = s3.get_bucket(AWS_BUCKET_NAME)

    # Files in Amazon S3 are called "objects" and are stored in buckets. A specific
    # object is referred to by its key (i.e., name) and holds data. Here, we create
    # a new object with the key "python_sample_key.txt" and content "Hello World!".
    #
    # For more information on keys and set_contents_from_string, see:
    # http://boto.readthedocs.org/en/latest/s3_tut.html#storing-data
    from boto.s3.key import Key
    k = Key(bucket)
    k.key = AWS_KEYNAME

    # Initiate a multi-part upload to prevent connection loss / slow data transfer.
    mp = bucket.initiate_multipart_upload(AWS_KEYNAME, reduced_redundancy=USE_REDUCED_REDUNDANCY)
    with multimap(CORES) as pmap:
        for _ in pmap(transfer_part, ((mp.id, mp.key_name, mp.bucket_name, i, part)
                                      for (i, part) in
                                      enumerate(split_file(RESULTS_PATH, MB_SIZE, CORES)))):
            pass
    mp.complete_upload()

    # Fetch the key to show that we stored something. Key.generate_url will
    # construct a URL that can be used to access the object for a limited time.
    # Here, we set it to expire in 30 minutes.
    #
    # For a more detailed overview of generate_url's options, see:
    # http://boto.readthedocs.org/en/latest/ref/s3.html#boto.s3.key.Key.generate_url
    expires_in_seconds = 1800

    # Delete old backups.
    for key in bucket.list():
        date = datetime.datetime.strptime(key.key[:10], "%Y_%m_%d")
        if date < TWO_WEEKS_AGO:
            # Deleting backup.
            key.delete()

# Copying tarball to backup location.
result = subprocess.call(['mv', RESULTS_PATH, BACKUP_PATH])

# Deleting old local backups.
result = subprocess.call(["find", BACKUP_PATH, "-maxdepth", "1",
                          "-mtime", "+" + str(ARCHIVE_DAYS),
                          "-exec", "rm", "-r", "{}", ";"])
	"""
	A script that backs up the database and sends the compressed output to an S3 bucket
	"""
	import os
	import sys
	import glob
	import subprocess
	import contextlib
	import functools
	import multiprocessing
	from multiprocessing.pool import IMapIterator
	import rfc822
	import time
	import datetime
	from dateutil.relativedelta import relativedelta
	import boto
	import uuid
	import logging


	# Define variables here.
	NOW = datetime.datetime.now() # Used for filenames etc.
	USERFILES_PATHS = ["/my/list/of/folders/and/files/to/copy"]
	RESULTS_PATH = "/tmp/" + NOW.strftime("%Y_%m_%d") + "-backup.tar.gz"
	BACKUP_PATH = "/raid"
	TWO_WEEKS_AGO=datetime.datetime.now() - datetime.timedelta(days=14)

	# The number of days to keep local backups in external storage.
	ARCHIVE_DAYS = 14


	AWS_ACCESS_KEY_ID='my-access-key'
	AWS_SECRET_ACCESS_KEY='my-secret-access-key'
	AWS_BUCKET_NAME = "my-s3-bucket"
	# The name of the file to be stored in the Amazon S3 bucket, called a "key".
	AWS_KEYNAME=NOW.strftime("%Y_%m_%d-backup") + ".tar.gz"
	# The number of CPU cores with which to split the pieces of the tarball.
	CORES=1
	# If True, stores the file in a "Reduced Redundancy" class in S3.
	USE_REDUCED_REDUNDANCY=False
	# The size in MB of each piece of the file to upload.
	MB_SIZE=100


	@contextlib.contextmanager
	def multimap(cores=None):
	if cores is None:
	cores = max(multiprocessing.cpu_count() - 1, 1)
	def wrapper(func):
	def wrap(self, timeout=None):
	return func(self, timeout=timeout if timeout is not None else 1e100)
	return wrap
	IMapIterator.next = wrapper(IMapIterator.next)
	pool = multiprocessing.Pool(cores)
	yield pool.imap
	pool.terminate()


	def map_wrap(f):
	@functools.wraps(f)
	def wrapper(args, *kwargs):
	return apply(f, args, *kwargs)
	return wrapper


	def mp_from_ids(mp_id, mp_keyname, mp_bucketname):
	conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
	bucket = conn.lookup(mp_bucketname)
	mp = boto.s3.multipart.MultiPartUpload(bucket)
	mp.key_name = mp_keyname
	mp.id = mp_id
	return mp


	def split_file(in_file, mb_size, split_num=5):
	prefix = os.path.join(os.path.dirname(in_file),
	"%sS3PART" % (os.path.basename(AWS_KEYNAME)))
	split_size = int(min(mb_size / (split_num * 2.0), 250))
	if not os.path.exists("%saa" % prefix):
	cl = ["split", "-d", "--suffix-length=5", "-b%sm" % split_size, in_file, prefix]
	subprocess.check_call(cl)
	return sorted(glob.glob("%s*" % prefix))


	@map_wrap
	def transfer_part(mp_id, mp_keyname, mp_bucketname, i, part):
	mp = mp_from_ids(mp_id, mp_keyname, mp_bucketname)
	logger.info(" Transferring " + str(i) + str(part))
	with open(part) as t_handle:
	mp.upload_part_from_file(t_handle, i+1)
	os.remove(part)

	# Compress everything together.
	result = subprocess.call(['tar', 'czf', RESULTS_PATH] + USERFILES_PATHS)

	if not result:
	# The backup completed successfully.

	# Instantiate a new client for Amazon Simple Storage Service (S3). With no
	# parameters or configuration, the AWS SDK for Python (Boto) will look for
	# access keys in these environment variables:
	# For more information about this interface to Amazon S3, see:
	# http://boto.readthedocs.org/en/latest/s3_tut.html
	# Configure a host with the kwarg host="s3-us-west-2.amazonaws.com"
	s3 = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

	# Everything uploaded to Amazon S3 must belong to a bucket. These buckets are
	# in the global namespace, and must have a unique name.
	#
	# For more information about bucket name restrictions, see:
	# http://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html
	logger.info("Opening bucket with name: " + AWS_BUCKET_NAME)
	bucket = s3.get_bucket(AWS_BUCKET_NAME)

	# Files in Amazon S3 are called "objects" and are stored in buckets. A specific
	# object is referred to by its key (i.e., name) and holds data. Here, we create
	# a new object with the key "python_sample_key.txt" and content "Hello World!".
	#
	# For more information on keys and set_contents_from_string, see:
	# http://boto.readthedocs.org/en/latest/s3_tut.html#storing-data
	from boto.s3.key import Key
	k = Key(bucket)
	k.key = AWS_KEYNAME

	# Initiate a multi-part upload to prevent connection loss / slow data transfer.
	mp = bucket.initiate_multipart_upload(AWS_KEYNAME, reduced_redundancy=USE_REDUCED_REDUNDANCY)
	with multimap(CORES) as pmap:
	for _ in pmap(transfer_part, ((mp.id, mp.key_name, mp.bucket_name, i, part)
	for (i, part) in
	enumerate(split_file(RESULTS_PATH, MB_SIZE, CORES)))):
	pass
	mp.complete_upload()

	# Fetch the key to show that we stored something. Key.generate_url will
	# construct a URL that can be used to access the object for a limited time.
	# Here, we set it to expire in 30 minutes.
	#
	# For a more detailed overview of generate_url's options, see:
	# http://boto.readthedocs.org/en/latest/ref/s3.html#boto.s3.key.Key.generate_url
	expires_in_seconds = 1800

	# Delete old backups.
	for key in bucket.list():
	date = datetime.datetime.strptime(key.key[:10], "%Y_%m_%d")
	if date < TWO_WEEKS_AGO:
	# Deleting backup.
	key.delete()

	# Copying tarball to backup location.
	result = subprocess.call(['mv', RESULTS_PATH, BACKUP_PATH])

	# Deleting old local backups.
	result = subprocess.call(["find", BACKUP_PATH, "-maxdepth", "1",
	"-mtime", "+" + str(ARCHIVE_DAYS),
	"-exec", "rm", "-r", "{}", ";"])