Skip to content

Instantly share code, notes, and snippets.

@mzhaase

mzhaase/s3_backup.py

Created Mar 12, 2020
Embed
What would you like to do?
"""Locally compresses and encrypts data into cache directory, checks if the data
stored on S3 is different. If yes, uploads new version. Otherwise skips them.
Encryption provided by openssl enc, I do not feel comfortable writing my own
implementation.
Versioning is supposed to be done with s3 versioning
Basic process:
- find all subfolders / files of backup_path up to a depth of depth
this makes it so that folders which do not change often do not get backed up
all the time
- create a tar archive for all of these files, and get an MD5 hash of the
archive
- compare the MD5 hash either with S3 metadata, or, if defined, the hash stored
in dynamo_db_table. dynamodb is recommended because dynamodb lookups are much
cheaper
- if the hash for the file differs, encrypt using openssl aes-256-cbc and a 4KB
keyfile
- upload to S3 using multipart upload with key
[prefix]/backup_path/path/to/file.tar.bz2
Known issues:
If there are no subfolders / files in depth n but in depth n-1 they will not be
backed up. Example:
/a/b/c
/a/b/d
/a/b/e
/a/f
Will back up the following folders:
depth=0: a
depth=1: b, f
depth=2: c, d, e
Requirements:
- boto3
- aws credentials and regions specified in some way, so either ENV, or in
~/.aws/config
- openssl command line tool
- S3 bucket
Optional:
- DynamoDB table, pay-per-request, partition key: s3_key
Permissions required:
data "aws_iam_policy_document" "backup_policy" {
statement {
actions = [
"s3:ListAllMyBuckets",
"s3:GetBucketLocation",
]
resources = [
"arn:aws:s3:::*",
]
}
statement {
actions = [
"s3:GetObject*",
"s3:ListBucket",
"s3:PutObject*"
]
resources = [
aws_s3_bucket.bucket.arn,
"${aws_s3_bucket.bucket.arn}/*"
]
}
statement {
actions = [
"dynamoDb:GetItem",
"dynamoDb:PutItem",
"dynamoDb:UpdateItem"
]
resources = [
aws_dynamodb_table.table.arn
]
}
}
Usage:
See help. prefix and dynamodb_table_name are optional. If dynamodb_table_name
is set, will store md5 in dynamodb. If it is not set, it will use S3 metadata
instead, which is more expensive
Example terraform config:
variable "account_number" {}
variable "backup_users" { type = map }
provider "aws" {
region = xyz
}
resource "aws_s3_bucket" "bucket" {
bucket = "abc"
acl = "private"
region = xyz
server_side_encryption_configuration {
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}
versioning {
enabled = true
}
lifecycle_rule {
enabled = true
noncurrent_version_expiration {
days = 180
}
}
tags = {
project = "backup"
}
}
resource "aws_s3_bucket_public_access_block" "public_access_block" {
bucket = aws_s3_bucket.bucket.id
block_public_acls = true
block_public_policy = true
ignore_public_acls = true
restrict_public_buckets = true
}
resource "aws_dynamodb_table" "table" {
name = "backup"
billing_mode = "PAY_PER_REQUEST"
hash_key = "s3_key"
attribute {
name = "s3_key"
type = "S"
}
tags = {
Name = "def"
project = "backup"
}
}
resource "aws_iam_user" "users" {
for_each = var.backup_users
name = "backup_${each.key}"
tags = {
project = "backup"
}
}
data "aws_iam_policy_document" "backup_policy" {
statement {
actions = [
"s3:ListAllMyBuckets",
"s3:GetBucketLocation",
]
resources = [
"arn:aws:s3:::*",
]
}
statement {
actions = [
"s3:GetObject*",
"s3:ListBucket",
"s3:PutObject*"
]
resources = [
aws_s3_bucket.bucket.arn,
"${aws_s3_bucket.bucket.arn}/*"
]
}
statement {
actions = [
"dynamoDb:GetItem",
"dynamoDb:PutItem",
"dynamoDb:UpdateItem"
]
resources = [
aws_dynamodb_table.table.arn
]
}
}
resource "aws_iam_user_policy" "backup" {
for_each = var.backup_users
name = aws_iam_user.users[each.key].name
user = aws_iam_user.users[each.key].name
policy = data.aws_iam_policy_document.backup_policy.json
}
"""
import argparse
import glob
import logging
import os
import sys
import tarfile
from hashlib import md5
from subprocess import check_output
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
logging.basicConfig(level=logging.INFO)
def get_subpaths(path, depth):
    """Return all files and directories exactly ``depth`` levels below path.

    Arguments:
        path {str} -- Filesystem path to start from
        depth {int} -- Number of directory levels to descend; 0 returns
            path itself (if it exists)

    Returns:
        list -- Matching paths; empty if path does not exist or nothing
            matches at that depth
    """
    # glob trick based on https://stackoverflow.com/a/7159726/3785588
    # (stackoverflow user phihag)
    pattern = '{}{}'.format(path, '/*' * depth)
    return glob.glob(pattern)
def compress(path, cache_path, algorithm):
    """Creates a tarfile compressed with algorithm
    and the basename of path to cache_path

    Arguments:
        path {str} -- Path to compress
        cache_path {str} -- Path where tar should be stored
        algorithm {str} -- Compression algorithm [gzip bzip2 lzma]

    Returns:
        str -- Path to tar file
    """
    # translate algorithm into modes understood by tarfile
    algorithms = {
        'gzip': {
            'filetype': 'gz',
            'mode': 'w:gz'
        },
        'bzip2': {
            'filetype': 'bz2',
            'mode': 'w:bz2'
        },
        'lzma': {
            'filetype': 'xz',
            'mode': 'w:xz'
        }
    }
    try:
        filetype = algorithms[algorithm]['filetype']
        mode = algorithms[algorithm]['mode']
    # only a missing dict key means an unknown algorithm; a bare except
    # here would also swallow KeyboardInterrupt / SystemExit
    except KeyError:
        logging.error('Unknown compression algorithm: ' + algorithm)
        sys.exit(1)
    # gives a path like cache_path/somefile.tar.xz
    tarpath = os.path.join(cache_path, os.path.basename(path)) + '.tar.' + filetype
    logging.info('TARing {} to {}'.format(path, tarpath))
    with tarfile.open(tarpath, mode=mode) as tar:
        tar.add(path, arcname=os.path.basename(path))
    return tarpath
def get_md5(path):
    """Return the hex MD5 digest of the file at path.

    Arguments:
        path {str} -- Path to file

    Returns:
        str -- md5 hash (hex digest)
    """
    # hash in 4 KB chunks so arbitrarily large files never need to fit
    # into memory; see https://stackoverflow.com/a/3431838/3785588
    digest = md5()
    with open(path, "rb") as f:
        chunk = f.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = f.read(4096)
    logging.info('MD5 of {} is {}'.format(path, digest.hexdigest()))
    return digest.hexdigest()
def get_md5_from_dynamodb(s3_key, dynamodb_table_name):
    """Return the md5 attribute stored in DynamoDB for s3_key.

    Arguments:
        s3_key {str} -- S3 key (partition key of the table)
        dynamodb_table_name {str} -- name of dynamodb table

    Returns:
        str -- md5 hash as stored in DynamoDB, '0' if key does not exist
    """
    dynamo = boto3.client('dynamodb')
    response = dynamo.get_item(
        TableName = dynamodb_table_name,
        Key = {
            's3_key': {
                'S': s3_key
            }
        },
        AttributesToGet = [ 'md5' ]
    )
    # the first upload of an object has no row yet; return a dummy md5 so
    # the caller treats the file as changed
    if 'Item' in response:
        return response['Item']['md5']['S']
    return '0'
def get_md5_from_s3(s3_key, bucket_name):
    """Returns md5 hash stored in S3 metadata

    Arguments:
        s3_key {str} -- s3 key
        bucket_name {str} -- name of bucket

    Returns:
        str -- md5 hash as stored in S3 metadata, '0' if the key does not
            exist or carries no md5 metadata

    Raises:
        botocore.exceptions.ClientError -- for any S3 error other than 404
    """
    client = boto3.client('s3')
    # we do a head request to get the metadata without downloading the object
    try:
        head_resp = client.head_object(
            Bucket = bucket_name,
            Key = s3_key
        )
    except botocore.exceptions.ClientError as e:
        # if the object does not exist, return a dummy md5 so the caller
        # treats the file as changed and uploads it
        if e.response['Error']['Code'] == "404":
            return '0'
        # any other error (permissions, throttling, ...) is a real problem
        raise
    # BUG FIX: the original nested this success path inside the except
    # handler, so a successful HEAD left s3_md5 unbound (UnboundLocalError)
    # and a non-404 error referenced the unbound head_resp
    if 'md5' in head_resp['Metadata']:
        return head_resp['Metadata']['md5']
    return '0'
def needs_reupload(s3_key, md5_local_filesystem, bucket_name, dynamodb_table_name):
    """Return True if the file does not exist on S3 or its stored hash
    differs from the local one.

    Arguments:
        s3_key {str} -- s3 key
        md5_local_filesystem {str} -- md5 hash of the local file
        bucket_name {str} -- name of target bucket
        dynamodb_table_name {str} -- either name of dynamodb table or empty
            string if dynamodb is not used

    Returns:
        bool -- True if the file needs to be (re)uploaded
    """
    # prefer dynamodb when configured: lookups there are much cheaper than
    # S3 HEAD requests
    if dynamodb_table_name == '':
        remote_md5 = get_md5_from_s3(s3_key, bucket_name)
    else:
        remote_md5 = get_md5_from_dynamodb(s3_key, dynamodb_table_name)
    logging.info('MD5 on S3 is: {}'.format(remote_md5))
    return remote_md5 != md5_local_filesystem
def encrypt(tarpath, keyfile):
    """Encrypts tarpath to tarpath + '.aes' using keyfile and openssl,
    then deletes the unencrypted file.

    Arguments:
        tarpath {str} -- path to file to encrypt
        keyfile {str} -- path to keyfile; will be created if it does not exist
    """
    # generate keyfile if it does not exist
    if not os.path.exists(keyfile):
        logging.info('Keyfile does not exist. Generating 4096 Byte key.')
        with open(keyfile, 'wb') as f:
            f.write(os.urandom(4096))
    # theoretically, one could use AES primitives for encryption but that is
    # hard to get right and easy to get wrong, so we just use openssl instead
    # this also means you can decrypt your files on the commandline
    logging.info('Encrypting')
    # BUG FIX: build the argument list directly instead of formatting one
    # string and calling .split(' '), which broke on paths containing spaces
    command = [
        'openssl', 'enc', '-aes-256-cbc', '-pbkdf2',
        '-pass', 'file:' + keyfile,
        '-in', tarpath,
        '-out', tarpath + '.aes',
    ]
    output = check_output(command)
    logging.debug('OpenSSL output: {}'.format(output))
    # after encrypting, remove the unencrypted file
    os.remove(tarpath)
def upload(s3_key, path, bucket_name, md5_local_filesystem, dynamodb_table_name):
    """Upload a local file to S3 (multipart for large files), storing its
    md5 in the object metadata and, optionally, in DynamoDB.

    Arguments:
        s3_key {str} -- s3 key
        path {str} -- path on local storage to upload
        bucket_name {str} -- bucket name
        md5_local_filesystem {str} -- md5 hash to put into s3 / dynamodb
        dynamodb_table_name {str} -- name of the dynamoDB table; ignored
            when empty
    """
    s3 = boto3.resource('s3')
    # large files are sent via multipart upload, which needs an explicit
    # transfer configuration
    transfer_config = TransferConfig(
        multipart_threshold = 1024 * 25,
        multipart_chunksize = 1024 * 25,
        max_concurrency = 10,
        use_threads = True
    )
    logging.info('Uploading {} to s3://{}/{}'.format(path, bucket_name, s3_key))
    extra_args = {
        'ACL': 'private',
        'Metadata': { 'md5': md5_local_filesystem },
        'StorageClass': 'GLACIER'
    }
    s3.meta.client.upload_file(
        Filename = path,
        Bucket = bucket_name,
        Key = s3_key,
        Config = transfer_config,
        ExtraArgs = extra_args
    )
    if dynamodb_table_name != '':
        set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name)
def set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name):
    """Store the md5 hash for a specific s3_key in DynamoDB.

    Arguments:
        s3_key {str} -- s3 key (partition key of the table)
        md5_local_filesystem {str} -- md5 hash to put on dynamodb
        dynamodb_table_name {str} -- Name of dynamoDB table
    """
    dynamo = boto3.client('dynamodb')
    logging.info('Updating DynamoDB entry for {}'.format(s3_key))
    response = dynamo.put_item(
        TableName = dynamodb_table_name,
        Item = {
            's3_key': { 'S': s3_key },
            'md5': { 'S': md5_local_filesystem }
        }
    )
    # put_item failures normally raise, but log any error-level status code
    # that slips through; best-effort, do not abort the backup over it
    status = response['ResponseMetadata']['HTTPStatusCode']
    if status > 399:
        logging.error('DynamoDB response code: {}'.format(status))
def main():
    """Parse arguments, then archive, encrypt and upload every subpath of
    backup_path that is not already stored on S3 with the same md5."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--backup_path', type=str, required=True, help='Path that should be backed up')
    # BUG FIX: help text was a copy-paste duplicate of --backup_path's
    parser.add_argument('--cache_path', type=str, required=True, help='Path where archives are stored before upload')
    parser.add_argument('--bucket_name', type=str, required=True, help='Bucket to back up to')
    parser.add_argument('--prefix', type=str, default='', help='Optional prefix prepended to all S3 keys')
    parser.add_argument('--dynamodb_table_name', type=str, required=False, default='', help='Optional name of DynamoDB table to store metadata')
    parser.add_argument('--depth', type=int, required=True, help='How deep to descend into directories before creating archives. Depth of 0 means only backup-path will be compressed, depth of 1 means every file / folder within backup-path will get its own archive, etc.')
    parser.add_argument('--compression_algorithm', type=str, required=False, default='bzip2', choices=['lzma', 'bzip2', 'gzip'], help='Compression algorithm to use.')
    parser.add_argument('--keyfile', type=str, required=True, help='Path to keyfile. Will be generated if it does not exist')
    args = parser.parse_args()
    # first, get a list of all the paths that should be backed up individually
    paths_to_backup = get_subpaths(args.backup_path, args.depth)
    if not paths_to_backup:
        logging.error('path {} not found or empty'.format(args.backup_path))
    for path in paths_to_backup:
        # compress every path into an unencrypted tar file
        tarpath = compress(path, args.cache_path, args.compression_algorithm)
        # this is the md5 hash we will use to check if the local file is
        # different from the file on S3. we have to do this before encryption
        # since AES uses an initialization vector (IV), which changes the hash
        # of an encrypted file, even if you encrypt the same data twice
        md5_local_filesystem = get_md5(tarpath)
        # create the S3 key, which is the path the file will be stored in in S3
        # this is of format [prefix]/path/filename
        # so if backing up /home/myuser/mydir/something, with prefix home,
        # key will be home//home/myuser/mydir/something.tar.xz on a unix system
        # on windows this will not happen because os.path uses \\. so we have to
        # do a little dirty hack here with the replace // at the end
        s3_key = '{}/{}/{}'.format(
            args.prefix,
            os.path.dirname(path).replace('\\', '/'),
            os.path.basename(tarpath)
        ).replace('//', '/')
        # s3 keys should not start with a / otherwise you have an empty prefix
        # it will work but it is confusing
        if s3_key.startswith('/'):
            s3_key = s3_key[1:]
        # this method checks if the file actually needs to be reuploaded
        if needs_reupload(s3_key, md5_local_filesystem, args.bucket_name, args.dynamodb_table_name):
            logging.info('File needs to be uploaded')
            # encrypt encrypts the file into a new file with ending .aes using
            # keyfile as the pass, then deletes the original tar file
            encrypt(tarpath, args.keyfile)
            upload(s3_key, tarpath + '.aes', args.bucket_name, md5_local_filesystem, args.dynamodb_table_name)
            os.remove(tarpath + '.aes')
        else:
            logging.info('MD5 on S3 is identical to local version. Skipping')
            os.remove(tarpath)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.