Skip to content

Instantly share code, notes, and snippets.

@mzhaase

mzhaase/s3_backup.py

Created Mar 12, 2020
Embed
What would you like to do?
"""Locally compresses and encrypts data into cache directory, checks if the data
stored on S3 is different. If yes, uploads new version. Otherwise skips them.
Encryption provided by openssl enc, I do not feel comfortable writing my own
implementation.
Versioning is supposed to be done with s3 versioning
Basic process:
- find all subfolders / files of backup_path up to a depth of depth
this makes it so that folders which do not change often do not get backed up
all the time
- create a tar archive for all of these files, and get an MD5 hash of the
archive
- compare the MD5 hash either with S3 metadata, or, if defined, the hash stored
in dynamo_db_table. dynamodb is recommended because dynamodb lookups are much
cheaper
- if the hash for the file differs, encrypt using openssl aes-256-cbc and a 4KB
keyfile
- upload to S3 using multipart upload with key
[prefix]/backup_path/path/to/file.tar.bz2
Known issues:
If there are no subfolders / files in depth n but in depth n-1 they will not be
backed up. Example:
/a/b/c
/a/b/d
/a/b/e
/a/f
Will back up the following folders:
depth=0: a
depth=1: b, f
depth=2: c, d, e
Requirements:
- boto3
- aws credentials and regions specified in some way, so either ENV, or in
~/.aws/config
- openssl command line tool
- S3 bucket
Optional:
- DynamoDB table, pay-per-request, partition key: s3_key
Permissions required:
data "aws_iam_policy_document" "backup_policy" {
statement {
actions = [
"s3:ListAllMyBuckets",
"s3:GetBucketLocation",
]
resources = [
"arn:aws:s3:::*",
]
}
statement {
actions = [
"s3:GetObject*",
"s3:ListBucket",
"s3:PutObject*"
]
resources = [
aws_s3_bucket.bucket.arn,
"${aws_s3_bucket.bucket.arn}/*"
]
}
statement {
actions = [
"dynamoDb:GetItem",
"dynamoDb:PutItem",
"dynamoDb:UpdateItem"
]
resources = [
aws_dynamodb_table.table.arn
]
}
}
Usage:
See help. prefix and dynamodb_table_name are optional. If dynamodb_table_name
is set, will store md5 in dynamodb. If it is not set, it will use S3 metadata
instead, which is more expensive
Example terraform config:
variable "account_number" {}
variable "backup_users" { type = map }
provider "aws" {
region = xyz
}
resource "aws_s3_bucket" "bucket" {
bucket = "abc"
acl = "private"
region = xyz
server_side_encryption_configuration {
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}
versioning {
enabled = true
}
lifecycle_rule {
enabled = true
noncurrent_version_expiration {
days = 180
}
}
tags = {
project = "backup"
}
}
resource "aws_s3_bucket_public_access_block" "public_access_block" {
bucket = aws_s3_bucket.bucket.id
block_public_acls = true
block_public_policy = true
ignore_public_acls = true
restrict_public_buckets = true
}
resource "aws_dynamodb_table" "table" {
name = "backup"
billing_mode = "PAY_PER_REQUEST"
hash_key = "s3_key"
attribute {
name = "s3_key"
type = "S"
}
tags = {
Name = "def"
project = "backup"
}
}
resource "aws_iam_user" "users" {
for_each = var.backup_users
name = "backup_${each.key}"
tags = {
project = "backup"
}
}
data "aws_iam_policy_document" "backup_policy" {
statement {
actions = [
"s3:ListAllMyBuckets",
"s3:GetBucketLocation",
]
resources = [
"arn:aws:s3:::*",
]
}
statement {
actions = [
"s3:GetObject*",
"s3:ListBucket",
"s3:PutObject*"
]
resources = [
aws_s3_bucket.bucket.arn,
"${aws_s3_bucket.bucket.arn}/*"
]
}
statement {
actions = [
"dynamoDb:GetItem",
"dynamoDb:PutItem",
"dynamoDb:UpdateItem"
]
resources = [
aws_dynamodb_table.table.arn
]
}
}
resource "aws_iam_user_policy" "backup" {
for_each = var.backup_users
name = aws_iam_user.users[each.key].name
user = aws_iam_user.users[each.key].name
policy = data.aws_iam_policy_document.backup_policy.json
}
"""
import argparse
import glob
import logging
import os
import sys
import tarfile
from hashlib import md5
from subprocess import check_output
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
logging.basicConfig(level=logging.INFO)
def get_subpaths(path, depth):
    """Return all files and directories exactly ``depth`` levels below path.

    Arguments:
        path {str} -- Filesystem path to start from
        depth {int} -- Number of directory levels to descend; 0 returns
            path itself (if it exists)

    Returns:
        list -- Matching paths; empty if path does not exist or nothing
            matches at that depth
    """
    # glob trick based on https://stackoverflow.com/a/7159726/3785588
    # (stackoverflow user phihag)
    pattern = '{}{}'.format(path, '/*' * depth)
    return glob.glob(pattern)
def compress(path, cache_path, algorithm):
    """Creates a tarfile compressed with algorithm
    and the basename of path to cache_path

    Arguments:
        path {str} -- Path to compress
        cache_path {str} -- Path where tar should be stored
        algorithm {str} -- Compression algorithm [gzip bzip2 lzma]

    Returns:
        str -- Path to tar file
    """
    # translate algorithm into modes understood by tarfile
    algorithms = {
        'gzip': {
            'filetype': 'gz',
            'mode': 'w:gz'
        },
        'bzip2': {
            'filetype': 'bz2',
            'mode': 'w:bz2'
        },
        'lzma': {
            'filetype': 'xz',
            'mode': 'w:xz'
        }
    }
    try:
        filetype = algorithms[algorithm]['filetype']
        mode = algorithms[algorithm]['mode']
    # only a missing dict key means an unknown algorithm; a bare except
    # here would also swallow KeyboardInterrupt / SystemExit
    except KeyError:
        logging.error('Unknown compression algorithm: ' + algorithm)
        sys.exit(1)
    # gives a path like cache_path/somefile.tar.xz
    tarpath = os.path.join(cache_path, os.path.basename(path)) + '.tar.' + filetype
    logging.info('TARing {} to {}'.format(path, tarpath))
    with tarfile.open(tarpath, mode=mode) as tar:
        tar.add(path, arcname=os.path.basename(path))
    return tarpath
def get_md5(path):
    """Return the hex MD5 digest of the file at path.

    Arguments:
        path {str} -- Path to file

    Returns:
        str -- md5 hash (hex digest)
    """
    # hash in 4 KB chunks so arbitrarily large files never need to fit
    # into memory; see https://stackoverflow.com/a/3431838/3785588
    digest = md5()
    with open(path, "rb") as f:
        chunk = f.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = f.read(4096)
    logging.info('MD5 of {} is {}'.format(path, digest.hexdigest()))
    return digest.hexdigest()
def get_md5_from_dynamodb(s3_key, dynamodb_table_name):
    """Return the md5 attribute stored in DynamoDB for s3_key.

    Arguments:
        s3_key {str} -- S3 key (partition key of the table)
        dynamodb_table_name {str} -- name of dynamodb table

    Returns:
        str -- md5 hash as stored in DynamoDB, '0' if key does not exist
    """
    dynamo = boto3.client('dynamodb')
    response = dynamo.get_item(
        TableName = dynamodb_table_name,
        Key = {
            's3_key': {
                'S': s3_key
            }
        },
        AttributesToGet = [ 'md5' ]
    )
    # the first upload of an object has no row yet; return a dummy md5 so
    # the caller treats the file as changed
    if 'Item' in response:
        return response['Item']['md5']['S']
    return '0'
def get_md5_from_s3(s3_key, bucket_name):
    """Returns md5 hash stored in S3 metadata

    Arguments:
        s3_key {str} -- s3 key
        bucket_name {str} -- name of bucket

    Returns:
        str -- md5 hash as stored in S3 metadata, '0' if the key does not
            exist or carries no md5 metadata

    Raises:
        botocore.exceptions.ClientError -- for any S3 error other than 404
    """
    client = boto3.client('s3')
    # we do a head request to get the metadata without downloading the object
    try:
        head_resp = client.head_object(
            Bucket = bucket_name,
            Key = s3_key
        )
    except botocore.exceptions.ClientError as e:
        # if the object does not exist, return a dummy md5 so the caller
        # treats the file as changed and uploads it
        if e.response['Error']['Code'] == "404":
            return '0'
        # any other error (permissions, throttling, ...) is a real problem
        raise
    # BUG FIX: the original nested this success path inside the except
    # handler, so a successful HEAD left s3_md5 unbound (UnboundLocalError)
    # and a non-404 error referenced the unbound head_resp
    if 'md5' in head_resp['Metadata']:
        return head_resp['Metadata']['md5']
    return '0'
def needs_reupload(s3_key, md5_local_filesystem, bucket_name, dynamodb_table_name):
    """Return True if the file does not exist on S3 or its stored hash
    differs from the local one.

    Arguments:
        s3_key {str} -- s3 key
        md5_local_filesystem {str} -- md5 hash of the local file
        bucket_name {str} -- name of target bucket
        dynamodb_table_name {str} -- either name of dynamodb table or empty
            string if dynamodb is not used

    Returns:
        bool -- True if the file needs to be (re)uploaded
    """
    # prefer dynamodb when configured: lookups there are much cheaper than
    # S3 HEAD requests
    if dynamodb_table_name == '':
        remote_md5 = get_md5_from_s3(s3_key, bucket_name)
    else:
        remote_md5 = get_md5_from_dynamodb(s3_key, dynamodb_table_name)
    logging.info('MD5 on S3 is: {}'.format(remote_md5))
    return remote_md5 != md5_local_filesystem
def encrypt(tarpath, keyfile):
    """Encrypts tarpath to tarpath + '.aes' using keyfile and openssl,
    then deletes the unencrypted file.

    Arguments:
        tarpath {str} -- path to file to encrypt
        keyfile {str} -- path to keyfile; will be created if it does not exist
    """
    # generate keyfile if it does not exist
    if not os.path.exists(keyfile):
        logging.info('Keyfile does not exist. Generating 4096 Byte key.')
        with open(keyfile, 'wb') as f:
            f.write(os.urandom(4096))
    # theoretically, one could use AES primitives for encryption but that is
    # hard to get right and easy to get wrong, so we just use openssl instead
    # this also means you can decrypt your files on the commandline
    logging.info('Encrypting')
    # BUG FIX: build the argument list directly instead of formatting one
    # string and calling .split(' '), which broke on paths containing spaces
    command = [
        'openssl', 'enc', '-aes-256-cbc', '-pbkdf2',
        '-pass', 'file:' + keyfile,
        '-in', tarpath,
        '-out', tarpath + '.aes',
    ]
    output = check_output(command)
    logging.debug('OpenSSL output: {}'.format(output))
    # after encrypting, remove the unencrypted file
    os.remove(tarpath)
def upload(s3_key, path, bucket_name, md5_local_filesystem, dynamodb_table_name):
    """Upload a local file to S3 (multipart for large files), storing its
    md5 in the object metadata and, optionally, in DynamoDB.

    Arguments:
        s3_key {str} -- s3 key
        path {str} -- path on local storage to upload
        bucket_name {str} -- bucket name
        md5_local_filesystem {str} -- md5 hash to put into s3 / dynamodb
        dynamodb_table_name {str} -- name of the dynamoDB table; ignored
            when empty
    """
    s3 = boto3.resource('s3')
    # large files are sent via multipart upload, which needs an explicit
    # transfer configuration
    transfer_config = TransferConfig(
        multipart_threshold = 1024 * 25,
        multipart_chunksize = 1024 * 25,
        max_concurrency = 10,
        use_threads = True
    )
    logging.info('Uploading {} to s3://{}/{}'.format(path, bucket_name, s3_key))
    extra_args = {
        'ACL': 'private',
        'Metadata': { 'md5': md5_local_filesystem },
        'StorageClass': 'GLACIER'
    }
    s3.meta.client.upload_file(
        Filename = path,
        Bucket = bucket_name,
        Key = s3_key,
        Config = transfer_config,
        ExtraArgs = extra_args
    )
    if dynamodb_table_name != '':
        set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name)
def set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name):
    """Store the md5 hash for a specific s3_key in DynamoDB.

    Arguments:
        s3_key {str} -- s3 key (partition key of the table)
        md5_local_filesystem {str} -- md5 hash to put on dynamodb
        dynamodb_table_name {str} -- Name of dynamoDB table
    """
    dynamo = boto3.client('dynamodb')
    logging.info('Updating DynamoDB entry for {}'.format(s3_key))
    response = dynamo.put_item(
        TableName = dynamodb_table_name,
        Item = {
            's3_key': { 'S': s3_key },
            'md5': { 'S': md5_local_filesystem }
        }
    )
    # put_item failures normally raise, but log any error-level status code
    # that slips through; best-effort, do not abort the backup over it
    status = response['ResponseMetadata']['HTTPStatusCode']
    if status > 399:
        logging.error('DynamoDB response code: {}'.format(status))
def main():
    """Parse arguments, then archive, encrypt and upload every subpath of
    backup_path that is not already stored on S3 with the same md5."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--backup_path', type=str, required=True, help='Path that should be backed up')
    # BUG FIX: help text was a copy-paste duplicate of --backup_path's
    parser.add_argument('--cache_path', type=str, required=True, help='Path where archives are stored before upload')
    parser.add_argument('--bucket_name', type=str, required=True, help='Bucket to back up to')
    parser.add_argument('--prefix', type=str, default='', help='Optional prefix prepended to all S3 keys')
    parser.add_argument('--dynamodb_table_name', type=str, required=False, default='', help='Optional name of DynamoDB table to store metadata')
    parser.add_argument('--depth', type=int, required=True, help='How deep to descend into directories before creating archives. Depth of 0 means only backup-path will be compressed, depth of 1 means every file / folder within backup-path will get its own archive, etc.')
    parser.add_argument('--compression_algorithm', type=str, required=False, default='bzip2', choices=['lzma', 'bzip2', 'gzip'], help='Compression algorithm to use.')
    parser.add_argument('--keyfile', type=str, required=True, help='Path to keyfile. Will be generated if it does not exist')
    args = parser.parse_args()
    # first, get a list of all the paths that should be backed up individually
    paths_to_backup = get_subpaths(args.backup_path, args.depth)
    if not paths_to_backup:
        logging.error('path {} not found or empty'.format(args.backup_path))
    for path in paths_to_backup:
        # compress every path into an unencrypted tar file
        tarpath = compress(path, args.cache_path, args.compression_algorithm)
        # this is the md5 hash we will use to check if the local file is
        # different from the file on S3. we have to do this before encryption
        # since AES uses an initialization vector (IV), which changes the hash
        # of an encrypted file, even if you encrypt the same data twice
        md5_local_filesystem = get_md5(tarpath)
        # create the S3 key, which is the path the file will be stored in in S3
        # this is of format [prefix]/path/filename
        # so if backing up /home/myuser/mydir/something, with prefix home,
        # key will be home//home/myuser/mydir/something.tar.xz on a unix system
        # on windows this will not happen because os.path uses \\. so we have to
        # do a little dirty hack here with the replace // at the end
        s3_key = '{}/{}/{}'.format(
            args.prefix,
            os.path.dirname(path).replace('\\', '/'),
            os.path.basename(tarpath)
        ).replace('//', '/')
        # s3 keys should not start with a / otherwise you have an empty prefix
        # it will work but it is confusing
        if s3_key.startswith('/'):
            s3_key = s3_key[1:]
        # this method checks if the file actually needs to be reuploaded
        if needs_reupload(s3_key, md5_local_filesystem, args.bucket_name, args.dynamodb_table_name):
            logging.info('File needs to be uploaded')
            # encrypt encrypts the file into a new file with ending .aes using
            # keyfile as the pass, then deletes the original tar file
            encrypt(tarpath, args.keyfile)
            upload(s3_key, tarpath + '.aes', args.bucket_name, md5_local_filesystem, args.dynamodb_table_name)
            os.remove(tarpath + '.aes')
        else:
            logging.info('MD5 on S3 is identical to local version. Skipping')
            os.remove(tarpath)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.