"""Locally compresses and encrypts data into cache directory, checks if the data | |
stored on S3 is different. If yes, uploads new version. Otherwise skips them. | |
Encryption provided by openssl enc, I do not feel comfortable writing my own | |
implementation. | |
Versioning is supposed to be done with s3 versioning | |
Basic process:
- find all subfolders / files of backup_path at depth <depth>;
  this keeps folders that rarely change from being re-archived all the time
- create a tar archive for each of these paths and compute an MD5 hash of
  the archive
- compare the MD5 hash either with S3 metadata or, if defined, with the hash
  stored in dynamodb_table_name; DynamoDB is recommended because DynamoDB
  lookups are much cheaper
- if the hash for the file differs, encrypt using openssl aes-256-cbc and a
  4 KB keyfile
- upload to S3 using multipart upload with key
  [prefix]/backup_path/path/to/file.tar.bz2
Known issues:
Files / folders that exist at depth n-1 but have no children at depth n are
not backed up when running with depth n. Example:
/a/b/c
/a/b/d
/a/b/e
/a/f
will back up the following folders:
depth=0: a
depth=1: b, f
depth=2: c, d, e (f is skipped, since it has no children at depth 2)
Requirements:
- boto3
- AWS credentials and region specified in some way, either via ENV or in
  ~/.aws/config
- openssl command line tool
- S3 bucket
Optional:
- DynamoDB table, pay-per-request, partition key: s3_key
Permissions required:
data "aws_iam_policy_document" "backup_policy" {
  statement {
    actions = [
      "s3:ListAllMyBuckets",
      "s3:GetBucketLocation",
    ]
    resources = [
      "arn:aws:s3:::*",
    ]
  }
  statement {
    actions = [
      "s3:GetObject*",
      "s3:ListBucket",
      "s3:PutObject*",
    ]
    resources = [
      aws_s3_bucket.bucket.arn,
      "${aws_s3_bucket.bucket.arn}/*",
    ]
  }
  statement {
    actions = [
      "dynamodb:GetItem",
      "dynamodb:PutItem",
      "dynamodb:UpdateItem",
    ]
    resources = [
      aws_dynamodb_table.table.arn,
    ]
  }
}
Usage:
See --help. prefix and dynamodb_table_name are optional. If
dynamodb_table_name is set, the MD5 hash is stored in DynamoDB. If it is not
set, S3 metadata is used instead, which is more expensive to query.
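An illustrative invocation (paths and names are placeholders):
    python backup.py \
        --backup_path /home/myuser \
        --cache_path /tmp/backup_cache \
        --bucket_name abc \
        --prefix home \
        --dynamodb_table_name backup \
        --depth 2 \
        --keyfile /home/myuser/backup.key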
Example terraform config:
variable "account_number" {}
variable "backup_users" { type = map }
provider "aws" {
  region = "xyz"
}
resource "aws_s3_bucket" "bucket" {
  bucket = "abc"
  acl    = "private"
  region = "xyz"
  server_side_encryption_configuration {
    rule {
      apply_server_side_encryption_by_default {
        sse_algorithm = "AES256"
      }
    }
  }
  versioning {
    enabled = true
  }
  lifecycle_rule {
    enabled = true
    noncurrent_version_expiration {
      days = 180
    }
  }
  tags = {
    project = "backup"
  }
}
resource "aws_s3_bucket_public_access_block" "public_access_block" {
  bucket                  = aws_s3_bucket.bucket.id
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}
resource "aws_dynamodb_table" "table" {
  name         = "backup"
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "s3_key"
  attribute {
    name = "s3_key"
    type = "S"
  }
  tags = {
    Name    = "def"
    project = "backup"
  }
}
resource "aws_iam_user" "users" {
  for_each = var.backup_users
  name     = "backup_${each.key}"
  tags = {
    project = "backup"
  }
}
data "aws_iam_policy_document" "backup_policy" {
  statement {
    actions = [
      "s3:ListAllMyBuckets",
      "s3:GetBucketLocation",
    ]
    resources = [
      "arn:aws:s3:::*",
    ]
  }
  statement {
    actions = [
      "s3:GetObject*",
      "s3:ListBucket",
      "s3:PutObject*",
    ]
    resources = [
      aws_s3_bucket.bucket.arn,
      "${aws_s3_bucket.bucket.arn}/*",
    ]
  }
  statement {
    actions = [
      "dynamodb:GetItem",
      "dynamodb:PutItem",
      "dynamodb:UpdateItem",
    ]
    resources = [
      aws_dynamodb_table.table.arn,
    ]
  }
}
resource "aws_iam_user_policy" "backup" {
  for_each = var.backup_users
  name     = aws_iam_user.users[each.key].name
  user     = aws_iam_user.users[each.key].name
  policy   = data.aws_iam_policy_document.backup_policy.json
}
"""
import argparse
import glob
import logging
import os
import sys
import tarfile
from hashlib import md5
from subprocess import check_output

import boto3
import botocore
from boto3.s3.transfer import TransferConfig

logging.basicConfig(level=logging.INFO)

def get_subpaths(path, depth):
    """Returns a list of all files and directories at exactly
    depth <depth> below path (see Known issues in the module docstring)
    Arguments:
        path {str} -- Filesystem path
        depth {int} -- Depth at which paths should be returned
    Returns:
        list -- matching filesystem paths
    """
    # shamelessly stolen from stackoverflow user phihag
    # https://stackoverflow.com/a/7159726/3785588
    glob_pattern = path + '/*' * depth
    return glob.glob(glob_pattern)
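
# For example, with the directory layout from the module docstring,
# get_subpaths('/a', 2) expands to glob.glob('/a/*/*') and returns
# ['/a/b/c', '/a/b/d', '/a/b/e']; /a/f is missed, as described under
# "Known issues".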

def compress(path, cache_path, algorithm):
    """Creates a tarfile compressed with algorithm
    and the basename of path to cache_path
    Arguments:
        path {str} -- Path to compress
        cache_path {str} -- Path where tar should be stored
        algorithm {str} -- Compression algorithm [gzip bzip2 lzma]
    Returns:
        str -- Path to tar file
    """
    # translate algorithm into modes understood by tarfile
    algorithms = {
        'gzip': {
            'filetype': 'gz',
            'mode': 'w:gz'
        },
        'bzip2': {
            'filetype': 'bz2',
            'mode': 'w:bz2'
        },
        'lzma': {
            'filetype': 'xz',
            'mode': 'w:xz'
        }
    }
    try:
        filetype = algorithms[algorithm]['filetype']
        mode = algorithms[algorithm]['mode']
    except KeyError:
        logging.error('Unknown compression algorithm: ' + algorithm)
        sys.exit(1)
    # gives a path like cache_path/somefile.tar.xz
    tarpath = os.path.join(cache_path, os.path.basename(path)) + '.tar.' + filetype
    logging.info('TARing {} to {}'.format(path, tarpath))
    with tarfile.open(tarpath, mode=mode) as tar:
        tar.add(path, arcname=os.path.basename(path))
    return tarpath
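
# For instance, compress('/home/myuser/mydir', '/tmp/cache', 'bzip2') would
# archive /home/myuser/mydir and return '/tmp/cache/mydir.tar.bz2'.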

def get_md5(path):
    """Returns md5 of path
    Arguments:
        path {str} -- Path to file
    Returns:
        str -- md5 hash
    """
    # shamelessly stolen from quantumSoup
    # https://stackoverflow.com/a/3431838/3785588
    hash_md5 = md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    logging.info('MD5 of {} is {}'.format(path, hash_md5.hexdigest()))
    return hash_md5.hexdigest()

def get_md5_from_dynamodb(s3_key, dynamodb_table_name):
    """Returns DynamoDB property md5 of s3_key
    Arguments:
        s3_key {str} -- S3 key
        dynamodb_table_name {str} -- name of dynamodb table
    Returns:
        str -- md5 hash as stored in DynamoDB, '0' if key does not exist
    """
    client = boto3.client('dynamodb')
    resp = client.get_item(
        TableName = dynamodb_table_name,
        Key = {
            's3_key': {
                'S': s3_key
            }
        },
        AttributesToGet = [ 'md5' ]
    )
    # it could be that this is the first time uploading this object, in this
    # case the partition key s3_key will not exist so we return a dummy md5
    if 'Item' not in resp:
        md5_s3 = '0'
    else:
        md5_s3 = resp['Item']['md5']['S']
    return md5_s3
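
# An item written by this script has this shape (illustrative values):
#   {'s3_key': {'S': 'home/myuser/mydir.tar.bz2'},
#    'md5':    {'S': '9e107d9d372bb6826bd81d3542a419d6'}}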

def get_md5_from_s3(s3_key, bucket_name):
    """Returns md5 hash stored in S3 metadata
    Arguments:
        s3_key {str} -- s3 key
        bucket_name {str} -- name of bucket
    Returns:
        str -- md5 hash as stored in S3 metadata, '0' if key does not exist
    """
    client = boto3.client('s3')
    # we do a head request to get the metadata
    try:
        head_resp = client.head_object(
            Bucket = bucket_name,
            Key = s3_key
        )
    except botocore.exceptions.ClientError as e:
        # if the object does not exist, we set the md5 to a dummy value;
        # any other error is re-raised
        if e.response['Error']['Code'] == "404":
            s3_md5 = '0'
        else:
            raise
    else:
        # if it does exist we can extract the md5 hash we set during upload
        if 'md5' in head_resp['Metadata']:
            s3_md5 = head_resp['Metadata']['md5']
        else:
            s3_md5 = '0'
    return s3_md5
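
# The stored hash can also be inspected by hand with the AWS CLI, e.g.
# (illustrative bucket and key):
#   aws s3api head-object --bucket abc --key home/myuser/mydir.tar.bz2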

def needs_reupload(s3_key, md5_local_filesystem, bucket_name, dynamodb_table_name):
    """Returns True if the file either does not exist on S3 or the hash is
    different
    Arguments:
        s3_key {str} -- s3 key
        md5_local_filesystem {str} -- md5 hash of the local file
        bucket_name {str} -- name of target bucket
        dynamodb_table_name {str} -- either name of dynamodb table or empty
                                     string if dynamodb is not used
    Returns:
        bool -- True if the file needs to be (re)uploaded
    """
    if dynamodb_table_name == '':
        md5_s3 = get_md5_from_s3(s3_key, bucket_name)
    else:
        md5_s3 = get_md5_from_dynamodb(s3_key, dynamodb_table_name)
    logging.info('MD5 on S3 is: {}'.format(md5_s3))
    return md5_local_filesystem != md5_s3

def encrypt(tarpath, keyfile):
    """Encrypts tarpath using keyfile and openssl
    Arguments:
        tarpath {str} -- path to file to encrypt
        keyfile {str} -- path to keyfile, will be created if it does not exist
    """
    # generate keyfile if it does not exist
    if not os.path.exists(keyfile):
        logging.info('Keyfile does not exist. Generating 4096 Byte key.')
        with open(keyfile, 'wb') as f:
            f.write(os.urandom(4096))
    # theoretically, one could use AES primitives for encryption but that is
    # hard to get right and easy to get wrong, so we just use openssl instead
    # this also means you can decrypt your files on the commandline
    logging.info('Encrypting')
    # the command is built as a list so that paths containing spaces survive
    command = [
        'openssl', 'enc', '-aes-256-cbc', '-pbkdf2',
        '-pass', 'file:' + keyfile,
        '-in', tarpath,
        '-out', tarpath + '.aes'
    ]
    output = check_output(command)
    logging.debug('OpenSSL output: {}'.format(output))
    # after encrypting, remove the old file
    os.remove(tarpath)
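
# A file encrypted this way can be decrypted on the command line with the
# mirror image of the command above (illustrative file names):
#   openssl enc -d -aes-256-cbc -pbkdf2 -pass file:backup.key \
#       -in mydir.tar.bz2.aes -out mydir.tar.bz2
# Keep a copy of the keyfile somewhere safe: without it the backups cannot
# be decrypted.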

def upload(s3_key, path, bucket_name, md5_local_filesystem, dynamodb_table_name):
    """Uploads file to S3, supporting multi-part upload
    Arguments:
        s3_key {str} -- s3 key
        path {str} -- path on local storage to upload
        bucket_name {str} -- bucket name
        md5_local_filesystem {str} -- md5 hash to put into s3 / dynamodb
        dynamodb_table_name {str} -- name of the dynamoDB table. will not be
                                     used if empty
    """
    s3 = boto3.resource('s3')
    # for large files, we are using multipart upload, for this we need to set
    # config; sizes are in bytes, and boto3 automatically raises chunk sizes
    # below S3's 5 MiB multipart minimum
    transfer_config = TransferConfig(
        multipart_threshold = 1024 * 25,
        max_concurrency = 10,
        multipart_chunksize = 1024 * 25,
        use_threads = True
    )
    logging.info('Uploading {} to s3://{}/{}'.format(
        path,
        bucket_name,
        s3_key
    ))
    s3.meta.client.upload_file(
        Bucket = bucket_name,
        Config = transfer_config,
        Filename = path,
        Key = s3_key,
        ExtraArgs = {
            'ACL': 'private',
            'Metadata': { 'md5': md5_local_filesystem },
            'StorageClass': 'GLACIER'
        }
    )
    if dynamodb_table_name != '':
        set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name)
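
# Objects are uploaded straight into the GLACIER storage class, so they must
# be restored before they can be downloaded again, e.g. (illustrative names):
#   aws s3api restore-object --bucket abc --key home/myuser/mydir.tar.bz2 \
#       --restore-request Days=7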

def set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name):
    """Updates data on DynamoDB for specific s3_key
    Arguments:
        s3_key {str} -- s3 key
        md5_local_filesystem {str} -- md5 hash to put on dynamodb
        dynamodb_table_name {str} -- Name of dynamoDB table
    """
    client = boto3.client('dynamodb')
    logging.info('Updating DynamoDB entry for {}'.format(s3_key))
    resp = client.put_item(
        TableName = dynamodb_table_name,
        Item = {
            's3_key': {
                'S': s3_key
            },
            'md5': {
                'S': md5_local_filesystem
            }
        }
    )
    if resp['ResponseMetadata']['HTTPStatusCode'] > 399:
        logging.error('DynamoDB response code: {}'.format(
            resp['ResponseMetadata']['HTTPStatusCode']
        ))

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--backup_path', type=str, required=True, help='Path that should be backed up')
    parser.add_argument('--cache_path', type=str, required=True, help='Path where temporary archives are stored')
    parser.add_argument('--bucket_name', type=str, required=True, help='Bucket to back up to')
    parser.add_argument('--prefix', type=str, default='', help='Optional prefix prepended to all S3 keys')
    parser.add_argument('--dynamodb_table_name', type=str, required=False, default='', help='Optional name of DynamoDB table to store metadata')
    parser.add_argument('--depth', type=int, required=True, help='How deep to descend into directories before creating archives. A depth of 0 means only backup_path itself is compressed, a depth of 1 means every file / folder within backup_path gets its own archive, etc.')
    parser.add_argument('--compression_algorithm', type=str, required=False, default='bzip2', choices=['lzma', 'bzip2', 'gzip'], help='Compression algorithm to use.')
    parser.add_argument('--keyfile', type=str, required=True, help='Path to keyfile. Will be generated if it does not exist')
    args = parser.parse_args()
    # first, get a list of all the paths that should be backed up individually
    paths_to_backup = get_subpaths(args.backup_path, args.depth)
    if not paths_to_backup:
        logging.error('path {} not found or empty'.format(args.backup_path))
        sys.exit(1)
    for path in paths_to_backup:
        # compress every path into an unencrypted tar file
        tarpath = compress(path, args.cache_path, args.compression_algorithm)
        # this is the md5 hash we will use to check if the local file is
        # different from the file on S3. we have to do this before encryption
        # since AES uses an initialization vector (IV), which changes the hash
        # of an encrypted file, even if you encrypt the same data twice
        md5_local_filesystem = get_md5(tarpath)
        # create the S3 key, i.e. the path the file will be stored under in S3,
        # of the format [prefix]/path/filename. On unix, joining prefix and an
        # absolute path produces a double slash (backing up
        # /home/myuser/mydir/something with prefix home first yields
        # home//home/myuser/mydir/something.tar.bz2); on Windows os.path uses
        # backslashes instead. So backslashes are normalized to forward
        # slashes and '//' is collapsed to '/' at the end
        s3_key = '{}/{}/{}'.format(
            args.prefix,
            os.path.dirname(path).replace('\\', '/'),
            os.path.basename(tarpath)
        ).replace('//', '/')
        # s3 keys should not start with a '/': that creates an empty top-level
        # prefix, which works but is confusing
        if s3_key.startswith('/'):
            s3_key = s3_key[1:]
        # this method checks if the file actually needs to be reuploaded
        if needs_reupload(s3_key, md5_local_filesystem, args.bucket_name, args.dynamodb_table_name):
            logging.info('File needs to be uploaded')
            # encrypt encrypts the file into a new file with ending .aes using
            # keyfile as the pass, then deletes the original tar file
            encrypt(tarpath, args.keyfile)
            upload(s3_key, tarpath + '.aes', args.bucket_name, md5_local_filesystem, args.dynamodb_table_name)
            os.remove(tarpath + '.aes')
        else:
            logging.info('MD5 on S3 is identical to local version. Skipping')
            os.remove(tarpath)


if __name__ == "__main__":
    main()