"""Locally compresses and encrypts data into cache directory, checks if the data | |
stored on S3 is different. If yes, uploads new version. Otherwise skips them. | |
Encryption provided by openssl enc, I do not feel comfortable writing my own | |
implementation. | |
Versioning is supposed to be done with s3 versioning | |
Basic process: | |
- find all subfolders / files of backup_path up to a depth of depth | |
this makes it so that folders which do not change often do not get backed up | |
all the time | |
- create a tar archive for all of these files, and get an MD5 hash of the | |
archive | |
- compare the MD5 hash either with S3 metadata, or, if defined, the hash stored | |
in dynamo_db_table. dynamodb is recommended because dynamodb lookups are much | |
cheaper | |
- if the hash for the file differs, encrypt using openssl aes-256-cbc and a 4KB | |
keyfile | |
- upload to S3 using multipart upload with key | |
[prefix]/backup_path/path/to/file.tar.bz2 | |
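
Restoring is not part of this script. A minimal sketch of the reverse operation
in Python, assuming the archive has already been downloaded from S3, the
default bzip2 compression was used, and the same keyfile is available (file
names are illustrative):

    from subprocess import check_output
    import tarfile

    # decrypt with the same openssl parameters used for encryption, plus -d
    check_output(['openssl', 'enc', '-d', '-aes-256-cbc', '-pbkdf2',
                  '-pass', 'file:backup.key',
                  '-in', 'mydir.tar.bz2.aes', '-out', 'mydir.tar.bz2'])
    # then extract the decrypted tar archive
    with tarfile.open('mydir.tar.bz2', mode='r:bz2') as tar:
        tar.extractall()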

Known issues:
Paths whose tree ends above the requested depth are skipped: if a file or
folder exists at depth n-1 but has no children at depth n, it will not be
backed up when running with depth n. Example, given:
    /a/b/c
    /a/b/d
    /a/b/e
    /a/f
the following folders will be backed up:
    depth=0: a
    depth=1: b, f
    depth=2: c, d, e (but not f, which has no children at depth 2)

Requirements:
- boto3
- AWS credentials and region configured in some way, either via environment
  variables or in ~/.aws/config
- openssl command line tool
- S3 bucket

Optional:
- DynamoDB table, pay-per-request, partition key: s3_key (type S)
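  If the table is used, this script writes one item per archive; as put by
  set_dynamodb_metadata() an item looks like this (key value is illustrative):

      {'s3_key': {'S': 'prefix/home/myuser/mydir.tar.bz2'},
       'md5': {'S': '<hex digest of the unencrypted tar archive>'}}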

Permissions required:

data "aws_iam_policy_document" "backup_policy" {
  statement {
    actions = [
      "s3:ListAllMyBuckets",
      "s3:GetBucketLocation",
    ]
    resources = [
      "arn:aws:s3:::*",
    ]
  }

  statement {
    actions = [
      "s3:GetObject*",
      "s3:ListBucket",
      "s3:PutObject*"
    ]
    resources = [
      aws_s3_bucket.bucket.arn,
      "${aws_s3_bucket.bucket.arn}/*"
    ]
  }

  statement {
    actions = [
      "dynamodb:GetItem",
      "dynamodb:PutItem",
      "dynamodb:UpdateItem"
    ]
    resources = [
      aws_dynamodb_table.table.arn
    ]
  }
}

Usage:
See --help. prefix and dynamodb_table_name are optional. If
dynamodb_table_name is set, the MD5 hashes are stored in DynamoDB; if it is
not set, S3 object metadata is used instead, which is more expensive to query.
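
Example invocation, assuming the script is saved as backup.py (paths, bucket
and table names are illustrative and match the terraform example below):

    python backup.py --backup_path /home/myuser --cache_path /tmp/backup_cache \
        --bucket_name abc --prefix home --depth 1 \
        --dynamodb_table_name backup --keyfile /home/myuser/backup.key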

Example terraform config:

variable "account_number" {}
variable "backup_users" { type = map }

provider "aws" {
  region = "xyz"
}

resource "aws_s3_bucket" "bucket" {
  bucket = "abc"
  acl    = "private"
  region = "xyz"

  server_side_encryption_configuration {
    rule {
      apply_server_side_encryption_by_default {
        sse_algorithm = "AES256"
      }
    }
  }

  versioning {
    enabled = true
  }

  lifecycle_rule {
    enabled = true

    noncurrent_version_expiration {
      days = 180
    }
  }

  tags = {
    project = "backup"
  }
}

resource "aws_s3_bucket_public_access_block" "public_access_block" {
  bucket                  = aws_s3_bucket.bucket.id
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

resource "aws_dynamodb_table" "table" {
  name         = "backup"
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "s3_key"

  attribute {
    name = "s3_key"
    type = "S"
  }

  tags = {
    Name    = "def"
    project = "backup"
  }
}

resource "aws_iam_user" "users" {
  for_each = var.backup_users
  name     = "backup_${each.key}"

  tags = {
    project = "backup"
  }
}
data "aws_iam_policy_document" "backup_policy" { | |
statement { | |
actions = [ | |
"s3:ListAllMyBuckets", | |
"s3:GetBucketLocation", | |
] | |
resources = [ | |
"arn:aws:s3:::*", | |
] | |
} | |
statement { | |
actions = [ | |
"s3:GetObject*", | |
"s3:ListBucket", | |
"s3:PutObject*" | |
] | |
resources = [ | |
aws_s3_bucket.bucket.arn, | |
"${aws_s3_bucket.bucket.arn}/*" | |
] | |
} | |
statement { | |
actions = [ | |
"dynamoDb:GetItem", | |
"dynamoDb:PutItem", | |
"dynamoDb:UpdateItem" | |
] | |
resources = [ | |
aws_dynamodb_table.table.arn | |
] | |
} | |
} | |
resource "aws_iam_user_policy" "backup" { | |
for_each = var.backup_users | |
name = aws_iam_user.users[each.key].name | |
user = aws_iam_user.users[each.key].name | |
policy = data.aws_iam_policy_document.backup_policy.json | |
} | |
""" | |
import argparse
import glob
import logging
import os
import sys
import tarfile
from hashlib import md5
from subprocess import check_output

import boto3
import botocore.exceptions
from boto3.s3.transfer import TransferConfig

logging.basicConfig(level=logging.INFO)

def get_subpaths(path, depth):
    """Returns a list of all files and directories
    exactly <depth> levels below path

    Arguments:
        path {str} -- Filesystem path
        depth {int} -- Depth at which paths should be returned

    Returns:
        list -- list of matching filesystem paths
    """
    # shamelessly stolen from stackoverflow user phihag
    # https://stackoverflow.com/a/7159726/3785588
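    # the depth is encoded in the glob pattern itself: path='/a' with depth=2
    # becomes '/a/*/*', matching /a/b/c, /a/b/d, /a/b/e but not /a/f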
    glob_pattern = path + '/*' * depth
    return glob.glob(glob_pattern)


def compress(path, cache_path, algorithm):
    """Creates a tar file in cache_path, named after the basename of path and
    compressed with algorithm

    Arguments:
        path {str} -- Path to compress
        cache_path {str} -- Path where the tar file should be stored
        algorithm {str} -- Compression algorithm [gzip bzip2 lzma]

    Returns:
        str -- Path to the tar file
    """
    # translate algorithm into modes understood by tarfile
    algorithms = {
        'gzip': {
            'filetype': 'gz',
            'mode': 'w:gz'
        },
        'bzip2': {
            'filetype': 'bz2',
            'mode': 'w:bz2'
        },
        'lzma': {
            'filetype': 'xz',
            'mode': 'w:xz'
        }
    }
    try:
        filetype = algorithms[algorithm]['filetype']
        mode = algorithms[algorithm]['mode']
    except KeyError:
        logging.error('Unknown compression algorithm: ' + algorithm)
        sys.exit(1)
    # gives a path like cache_path/somefile.tar.bz2
    tarpath = os.path.join(cache_path, os.path.basename(path)) + '.tar.' + filetype
    logging.info('TARing {} to {}'.format(path, tarpath))
    with tarfile.open(tarpath, mode=mode) as tar:
        tar.add(path, arcname=os.path.basename(path))
    return tarpath


def get_md5(path):
    """Returns md5 of path

    Arguments:
        path {str} -- Path to file

    Returns:
        str -- md5 hash
    """
    # shamelessly stolen from quantumSoup
    # https://stackoverflow.com/a/3431838/3785588
    hash_md5 = md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    logging.info('MD5 of {} is {}'.format(path, hash_md5.hexdigest()))
    return hash_md5.hexdigest()


def get_md5_from_dynamodb(s3_key, dynamodb_table_name):
    """Returns the DynamoDB attribute md5 of s3_key

    Arguments:
        s3_key {str} -- S3 key
        dynamodb_table_name {str} -- name of the DynamoDB table

    Returns:
        str -- md5 hash as stored in DynamoDB, '0' if the key does not exist
    """
    client = boto3.client('dynamodb')
    resp = client.get_item(
        TableName=dynamodb_table_name,
        Key={
            's3_key': {
                'S': s3_key
            }
        },
        AttributesToGet=['md5']
    )
    # it could be that this is the first time uploading this object, in this
    # case the partition key s3_key will not exist so we return a dummy md5
    if 'Item' not in resp:
        md5_s3 = '0'
    else:
        md5_s3 = resp['Item']['md5']['S']
    return md5_s3


def get_md5_from_s3(s3_key, bucket_name):
    """Returns the md5 hash stored in S3 metadata

    Arguments:
        s3_key {str} -- s3 key
        bucket_name {str} -- name of bucket

    Returns:
        str -- md5 hash as stored in S3 metadata, '0' if the key does not exist
    """
    client = boto3.client('s3')
    # we do a head request to get the metadata
    try:
        head_resp = client.head_object(
            Bucket=bucket_name,
            Key=s3_key
        )
    except botocore.exceptions.ClientError as e:
        # if the object does not exist, we return a dummy md5 value
        if e.response['Error']['Code'] == "404":
            return '0'
        raise
    # if it does exist we can extract the md5 hash we set during upload;
    # fall back to a dummy value if the metadata entry is missing
    return head_resp['Metadata'].get('md5', '0')


def needs_reupload(s3_key, md5_local_filesystem, bucket_name, dynamodb_table_name):
    """Returns True if the file either does not exist on S3 or its hash is
    different

    Arguments:
        s3_key {str} -- s3 key
        md5_local_filesystem {str} -- md5 hash of the local file
        bucket_name {str} -- name of target bucket
        dynamodb_table_name {str} -- either name of the DynamoDB table or empty
            string if DynamoDB is not used

    Returns:
        bool -- True if the file needs to be (re)uploaded
    """
    if dynamodb_table_name == '':
        md5_s3 = get_md5_from_s3(s3_key, bucket_name)
    else:
        md5_s3 = get_md5_from_dynamodb(s3_key, dynamodb_table_name)
    logging.info('MD5 on S3 is: {}'.format(md5_s3))
    return md5_local_filesystem != md5_s3


def encrypt(tarpath, keyfile):
    """Encrypts tarpath using keyfile and openssl

    Arguments:
        tarpath {str} -- path to the file to encrypt
        keyfile {str} -- path to keyfile, will be created if it does not exist
    """
    # generate keyfile if it does not exist
    if not os.path.exists(keyfile):
        logging.info('Keyfile does not exist. Generating 4096 byte key.')
        with open(keyfile, 'wb') as f:
            f.write(os.urandom(4096))
    # theoretically, one could use AES primitives for encryption but that is
    # hard to get right and easy to get wrong, so we just use openssl instead.
    # this also means you can decrypt your files on the command line with
    # openssl enc -d and the same arguments
    logging.info('Encrypting')
    # build the command as a list so paths containing spaces do not break it
    command = [
        'openssl', 'enc', '-aes-256-cbc', '-pbkdf2',
        '-pass', 'file:' + keyfile,
        '-in', tarpath,
        '-out', tarpath + '.aes'
    ]
    output = check_output(command)
    logging.debug('OpenSSL output: {}'.format(output))
    # after encrypting, remove the unencrypted tar file
    os.remove(tarpath)


def upload(s3_key, path, bucket_name, md5_local_filesystem, dynamodb_table_name):
    """Uploads a file to S3, supporting multipart upload

    Arguments:
        s3_key {str} -- s3 key
        path {str} -- path on local storage to upload
        bucket_name {str} -- bucket name
        md5_local_filesystem {str} -- md5 hash to put into S3 / DynamoDB
        dynamodb_table_name {str} -- name of the DynamoDB table, not used
            if empty
    """
    s3 = boto3.resource('s3')
    # use multipart uploads for anything above the (deliberately low) 25 KB
    # threshold, so large archives are uploaded in concurrent chunks; the
    # TransferConfig sizes are in bytes
    transfer_config = TransferConfig(
        multipart_threshold=1024 * 25,
        max_concurrency=10,
        multipart_chunksize=1024 * 25,
        use_threads=True
    )
    logging.info('Uploading {} to s3://{}/{}'.format(
        path,
        bucket_name,
        s3_key
    ))
    s3.meta.client.upload_file(
        Bucket=bucket_name,
        Config=transfer_config,
        Filename=path,
        Key=s3_key,
        ExtraArgs={
            'ACL': 'private',
            'Metadata': {'md5': md5_local_filesystem},
            'StorageClass': 'GLACIER'
        }
    )
    if dynamodb_table_name != '':
        set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name)


def set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name):
    """Updates data on DynamoDB for a specific s3_key

    Arguments:
        s3_key {str} -- s3 key
        md5_local_filesystem {str} -- md5 hash to put on DynamoDB
        dynamodb_table_name {str} -- name of the DynamoDB table
    """
    client = boto3.client('dynamodb')
    logging.info('Updating DynamoDB entry for {}'.format(s3_key))
    resp = client.put_item(
        TableName=dynamodb_table_name,
        Item={
            's3_key': {
                'S': s3_key
            },
            'md5': {
                'S': md5_local_filesystem
            }
        }
    )
    if resp['ResponseMetadata']['HTTPStatusCode'] > 399:
        logging.error('DynamoDB response code: {}'.format(
            resp['ResponseMetadata']['HTTPStatusCode']
        ))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--backup_path', type=str, required=True, help='Path that should be backed up')
    parser.add_argument('--cache_path', type=str, required=True, help='Path where compressed and encrypted archives are stored before upload')
    parser.add_argument('--bucket_name', type=str, required=True, help='Bucket to back up to')
    parser.add_argument('--prefix', type=str, default='', help='Optional prefix prepended to all S3 keys')
    parser.add_argument('--dynamodb_table_name', type=str, required=False, default='', help='Optional name of DynamoDB table to store metadata')
    parser.add_argument('--depth', type=int, required=True, help='How deep to descend into directories before creating archives. A depth of 0 means only backup_path itself is compressed, a depth of 1 means every file / folder within backup_path gets its own archive, etc.')
    parser.add_argument('--compression_algorithm', type=str, required=False, default='bzip2', choices=['lzma', 'bzip2', 'gzip'], help='Compression algorithm to use.')
    parser.add_argument('--keyfile', type=str, required=True, help='Path to keyfile. Will be generated if it does not exist')
    args = parser.parse_args()
    # first, get a list of all the paths that should be backed up individually
    paths_to_backup = get_subpaths(args.backup_path, args.depth)
    if not paths_to_backup:
        logging.error('path {} not found or empty'.format(args.backup_path))
    for path in paths_to_backup:
        # compress every path into an unencrypted tar file
        tarpath = compress(path, args.cache_path, args.compression_algorithm)
        # this is the md5 hash we will use to check whether the local file is
        # different from the file on S3. we have to do this before encryption
        # since AES uses an initialization vector (IV), which changes the hash
        # of an encrypted file, even if you encrypt the same data twice
        md5_local_filesystem = get_md5(tarpath)
        # create the S3 key, i.e. the path the file will be stored under in S3,
        # of the format [prefix]/path/filename.
        # on a unix system, backing up /home/myuser/mydir/something with prefix
        # home would first give home//home/myuser/mydir/something.tar.xz,
        # because os.path.dirname returns an absolute path starting with '/'.
        # on windows this does not happen because os.path uses '\\', which we
        # normalize to '/', so we collapse the double slash with the
        # replace('//', '/') at the end
        s3_key = '{}/{}/{}'.format(
            args.prefix,
            os.path.dirname(path).replace('\\', '/'),
            os.path.basename(tarpath)
        ).replace('//', '/')
        # s3 keys should not start with a '/', otherwise you get an empty
        # prefix; it would work but it is confusing
        if s3_key.startswith('/'):
            s3_key = s3_key[1:]
        # check whether the file actually needs to be reuploaded
        if needs_reupload(s3_key, md5_local_filesystem, args.bucket_name, args.dynamodb_table_name):
            logging.info('File needs to be uploaded')
            # encrypt writes the file to a new file ending in .aes, using
            # keyfile as the passphrase source, then deletes the original tar
            encrypt(tarpath, args.keyfile)
            upload(s3_key, tarpath + '.aes', args.bucket_name, md5_local_filesystem, args.dynamodb_table_name)
            os.remove(tarpath + '.aes')
        else:
            logging.info('MD5 on S3 is identical to local version. Skipping')
            os.remove(tarpath)


if __name__ == "__main__":
    main()