# Created March 12, 2020 19:55
"""Locally compresses and encrypts data into cache directory, checks if the data
stored on S3 is different. If yes, uploads new version. Otherwise skips them.
Encryption provided by openssl enc, I do not feel comfortable writing my own
Versioning is supposed to be done with s3 versioning
Basic process:
- find all subfolders / files of backup_path up to a depth of depth
this makes it so that folders which do not change often do not get backed up
all the time
- create a tar archive for each of these paths, and get an MD5 hash of the
  archive
- compare the MD5 hash either with S3 metadata, or, if defined, the hash stored
  in dynamo_db_table. dynamodb is recommended because dynamodb lookups are much
  cheaper than S3 HEAD requests
- if the hash for the file differs, encrypt using openssl aes-256-cbc and a 4KB
  keyfile
- upload to S3 using multipart upload with key [prefix]/path/filename
Known issues:
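If there are no subfolders / files at depth n, but there are some at depth n-1,
the entries at depth n-1 will not be backed up. Example:
a
|-- b
|   |-- c
|   |-- d
|   `-- e
`-- f
Will back up the following folders:
depth=0: a
depth=1: b, f
depth=2: c, d, e (f is skipped because it contains nothing at depth 2)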
Requirements:
- boto3
- aws credentials and regions specified in some way, so either ENV, or in
  ~/.aws/credentials and ~/.aws/config
- openssl command line tool
- S3 bucket
- DynamoDB table, pay-per-request, partition key: s3_key
Permissions required:
data "aws_iam_policy_document" "backup_policy" {
statement {
actions = [
resources = [
statement {
actions = [
resources = [
statement {
actions = [
resources = [
Usage:
See --help. prefix and dynamodb_table_name are optional. If dynamodb_table_name
is set, the md5 will be stored in dynamodb. If it is not set, S3 metadata is
used instead, which is more expensive
Example terraform config:
variable "account_number" {}
variable "backup_users" { type = map }
provider "aws" {
region = xyz
resource "aws_s3_bucket" "bucket" {
bucket = "abc"
acl = "private"
region = xyz
server_side_encryption_configuration {
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
versioning {
enabled = true
lifecycle_rule {
enabled = true
noncurrent_version_expiration {
days = 180
tags = {
project = "backup"
resource "aws_s3_bucket_public_access_block" "public_access_block" {
bucket =
block_public_acls = true
block_public_policy = true
ignore_public_acls = true
restrict_public_buckets = true
resource "aws_dynamodb_table" "table" {
name = "backup"
billing_mode = "PAY_PER_REQUEST"
hash_key = "s3_key"
attribute {
name = "s3_key"
type = "S"
tags = {
Name = "def"
project = "backup"
resource "aws_iam_user" "users" {
for_each = var.backup_users
name = "backup_${each.key}"
tags = {
project = "backup"
data "aws_iam_policy_document" "backup_policy" {
statement {
actions = [
resources = [
statement {
actions = [
resources = [
statement {
actions = [
resources = [
resource "aws_iam_user_policy" "backup" {
for_each = var.backup_users
name = aws_iam_user.users[each.key].name
user = aws_iam_user.users[each.key].name
policy = data.aws_iam_policy_document.backup_policy.json
"""
import argparse
import glob
import logging
import os
import sys
import tarfile
from hashlib import md5
from subprocess import check_output
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
def get_subpaths(path, depth):
    """Returns list of all files and directories located exactly <depth>
    levels below path

    The glob pattern is path followed by <depth> '/*' components, so a depth
    of 0 yields path itself (if it exists). Entries above the requested depth
    are not returned, and '*' does not match names starting with a dot.

    Arguments:
        path {str} -- Filesystem path
        depth {int} -- Number of directory levels to descend below path

    Returns:
        list -- sorted list of matching paths, empty if nothing matches
    """
    # shamelessly stolen from stackoverflow user phihag
    # escape the user supplied part so characters such as '[' or '*' in the
    # backup path are matched literally instead of being treated as wildcards
    glob_pattern = glob.escape(path) + '/*' * depth
    # sorted so that the processing order is stable between runs
    return sorted(glob.glob(glob_pattern))
def compress(path, cache_path, algorithm):
    """Creates a tarfile compressed with algorithm, named after the basename
    of path, inside cache_path

    Arguments:
        path {str} -- Path to compress
        cache_path {str} -- Path where tar should be stored
        algorithm {str} -- Compression algorithm [gzip bzip2 lzma]

    Returns:
        str -- Path to tar file

    Raises:
        ValueError -- if algorithm is not one of the supported names
    """
    # translate algorithm into modes understood by tarfile
    algorithms = {
        'gzip': {
            'filetype': 'gz',
            'mode': 'w:gz'
        },
        'bzip2': {
            'filetype': 'bz2',
            'mode': 'w:bz2'
        },
        'lzma': {
            'filetype': 'xz',
            'mode': 'w:xz'
        },
    }
    try:
        filetype = algorithms[algorithm]['filetype']
        mode = algorithms[algorithm]['mode']
    except KeyError:
        logging.error('Unknown compression algorithm: {}'.format(algorithm))
        raise ValueError(
            'Unknown compression algorithm: {}'.format(algorithm)) from None

    # normpath drops a trailing separator, which would otherwise make the
    # basename (and therefore the archive name) empty
    name = os.path.basename(os.path.normpath(path))

    # gives a path like cache_path/somefile.tar.xz
    tarpath = os.path.join(cache_path, name) + '.tar.' + filetype
    logging.info('TARing {} to {}'.format(path, tarpath))
    # NOTE(review): gzip streams appear to embed a creation timestamp, so the
    # MD5 of a .tar.gz may differ between runs even for unchanged input,
    # which would defeat the "skip if unchanged" check -- confirm
    with tarfile.open(tarpath, mode=mode) as tar:
        tar.add(path, arcname=name)
    return tarpath
def get_md5(path):
    """Returns md5 of the file at path

    The file is read in fixed-size chunks so that large archives do not have
    to fit into memory.

    Arguments:
        path {str} -- Path to file

    Returns:
        str -- md5 hash as lowercase hex string
    """
    # shamelessly stolen from quantumSoup
    hash_md5 = md5()
    with open(path, "rb") as f:
        # iter() with a sentinel stops as soon as read() returns b"" (EOF)
        for chunk in iter(lambda: f.read(65536), b""):
            hash_md5.update(chunk)
    digest = hash_md5.hexdigest()
    logging.info('MD5 of {} is {}'.format(path, digest))
    return digest
def get_md5_from_dynamodb(s3_key, dynamodb_table_name):
    """Returns DynamoDB property md5 of s3_key

    Arguments:
        s3_key {str} -- S3 key
        dynamodb_table_name {str} -- name of dynamodb table

    Returns:
        str -- md5 hash as stored in DynamoDB, '0' if key does not exist or
        has no md5 attribute
    """
    client = boto3.client('dynamodb')
    resp = client.get_item(
        TableName=dynamodb_table_name,
        Key={
            's3_key': {
                'S': s3_key
            }
        },
        AttributesToGet=['md5']
    )

    # it could be that this is the first time uploading this object, in this
    # case the partition key s3_key will not exist so we return a dummy md5.
    # the same dummy is used if the item exists but carries no md5 attribute
    item = resp.get('Item')
    if not item or 'md5' not in item:
        return '0'
    return item['md5']['S']
def get_md5_from_s3(s3_key, bucket_name):
    """Returns md5 hash stored in S3 metadata

    Arguments:
        s3_key {str} -- s3 key
        bucket_name {str} -- name of bucket

    Returns:
        str -- md5 hash as stored in S3 metadata, '0' if key does not exist
        or the object has no md5 metadata entry

    Raises:
        botocore.exceptions.ClientError -- for any error other than a 404
    """
    client = boto3.client('s3')

    # we do a head request to get the metadata
    try:
        head_resp = client.head_object(
            Bucket=bucket_name,
            Key=s3_key
        )
    except botocore.exceptions.ClientError as e:
        # if the object does not exist, we set the md5 to a dummy value
        if e.response['Error']['Code'] == "404":
            return '0'
        # anything else (permissions, throttling, ...) is a real failure and
        # must not be mistaken for "object missing"
        raise

    # if it does exist we can extract the md5 hash we set during upload
    if 'md5' in head_resp['Metadata']:
        s3_md5 = head_resp['Metadata']['md5']
    else:
        s3_md5 = '0'

    return s3_md5
def needs_reupload(s3_key, md5_local_filesystem, bucket_name, dynamodb_table_name):
    """Returns true if the file either does not exist on S3 or the stored
    hash is different from the local one

    Arguments:
        s3_key {str} -- s3 key
        md5_local_filesystem {str} -- md5 hash of the local file
        bucket_name {str} -- name of target bucket
        dynamodb_table_name {str} -- either name of dynamodb table or empty
            string if dynamodb not used

    Returns:
        bool -- True if the local hash differs from the stored one
    """
    # an empty (or missing) table name means the hash lives in S3 metadata
    if not dynamodb_table_name:
        md5_s3 = get_md5_from_s3(s3_key, bucket_name)
    else:
        md5_s3 = get_md5_from_dynamodb(s3_key, dynamodb_table_name)

    logging.info('MD5 on S3 is: {}'.format(md5_s3))

    return md5_local_filesystem != md5_s3
def encrypt(tarpath, keyfile):
    """Encrypts tarpath using keyfile and openssl

    The encrypted output is written to tarpath + '.aes' and the unencrypted
    input file is removed afterwards.

    Arguments:
        tarpath {str} -- path to file to encrypt
        keyfile {str} -- path to keyfile will be created if it does not exist

    Raises:
        subprocess.CalledProcessError -- if openssl exits with an error
    """
    # generate keyfile if it does not exist
    if not os.path.exists(keyfile):
        logging.info('Keyfile does not exist. Generating 4096 Byte key.')
        # the key is written as 4096 hex characters: openssl's "file:"
        # password source only uses the first line of the file, so raw random
        # bytes (which may contain a newline early on) would weaken the key.
        # the file is created with mode 0600 so only the owner can read it
        fd = os.open(keyfile, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
        with os.fdopen(fd, 'wb') as f:
            f.write(os.urandom(2048).hex().encode('ascii'))

    # theoretically, one could use AES primitives for encryption but that is
    # hard to get right and easy to get wrong, so we just use openssl instead
    # this also means you can decrypt your files on the commandline
    logging.info('Encrypting')
    # pass the arguments as a list instead of splitting a formatted string,
    # otherwise paths containing spaces are torn into several arguments
    command = [
        'openssl', 'enc', '-aes-256-cbc', '-pbkdf2',
        '-pass', 'file:{}'.format(keyfile),
        '-in', tarpath,
        '-out', tarpath + '.aes',
    ]
    output = check_output(command)
    logging.debug('OpenSSL output: {}'.format(output))

    # after encrypting, remove the old file
    os.remove(tarpath)
def upload(s3_key, path, bucket_name, md5_local_filesystem, dynamodb_table_name):
    """Uploads file to S3, supporting multi-part upload

    The md5 hash is attached to the object as S3 metadata and, if a table
    name is given, additionally written to DynamoDB.

    Arguments:
        s3_key {str} -- s3 key
        path {str} -- path on local storage to upload
        bucket_name {str} -- bucket name
        md5_local_filesystem {str} -- md5 hash to put into s3 / dynamodb
        dynamodb_table_name {str} -- name of the dynamoDB table. will not be
            used if empty
    """
    s3 = boto3.resource('s3')

    # for large files, we are using multipart upload, for this we need to set
    # config. threshold and chunk size are 25 MiB: the previous value of
    # 1024 * 25 (25 KiB) is below the 5 MiB minimum part size S3 accepts
    transfer_config = TransferConfig(
        multipart_threshold=25 * 1024 * 1024,
        max_concurrency=10,
        multipart_chunksize=25 * 1024 * 1024,
        use_threads=True
    )

    logging.info('Uploading {} to s3://{}/{}'.format(
        path,
        bucket_name,
        s3_key
    ))
    s3.meta.client.upload_file(
        Bucket=bucket_name,
        Config=transfer_config,
        Filename=path,
        Key=s3_key,
        ExtraArgs={
            'ACL': 'private',
            'Metadata': {'md5': md5_local_filesystem},
            'StorageClass': 'GLACIER'
        }
    )

    # only record the hash in DynamoDB once the upload itself went through
    if dynamodb_table_name:
        set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name)
def set_dynamodb_metadata(s3_key, md5_local_filesystem, dynamodb_table_name):
    """Updates data on DynamoDB for specific s3_key

    Writes (or overwrites) the item whose partition key is s3_key with the
    given md5 hash.

    Arguments:
        s3_key {str} -- s3 key
        md5_local_filesystem {str} -- md5 hash to put on dynamodb
        dynamodb_table_name {str} -- Name of dynamoDB table
    """
    client = boto3.client('dynamodb')

    logging.info('Updating DynamoDB entry for {}'.format(s3_key))
    resp = client.put_item(
        TableName=dynamodb_table_name,
        Item={
            's3_key': {
                'S': s3_key
            },
            'md5': {
                'S': md5_local_filesystem
            }
        }
    )

    # report any HTTP error status returned with the response
    status_code = resp['ResponseMetadata']['HTTPStatusCode']
    if status_code > 399:
        logging.error('DynamoDB response code: {}'.format(
            status_code
        ))
def _build_s3_key(prefix, path, archive_name):
    """Builds the S3 key [prefix/]<directory of path>/<archive_name>

    Backslashes in the directory part are turned into forward slashes, and
    empty path segments are dropped so the key never starts with '/' and
    never contains '//'.
    """
    raw_key = '{}/{}/{}'.format(
        prefix,
        os.path.dirname(path).replace('\\', '/'),
        archive_name
    )
    return '/'.join(segment for segment in raw_key.split('/') if segment)


def main():
    """Command line entry point: archive, hash, and upload changed paths

    For every path found at the requested depth below --backup_path a tar
    archive is created in --cache_path and hashed. The archive is encrypted
    and uploaded only if its hash differs from the one stored remotely.
    Exits with status 1 if nothing is found to back up.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--backup_path', type=str, required=True, help='Path that should be backed up')
    parser.add_argument('--cache_path', type=str, required=True, help='Path where archives are stored temporarily before upload')
    parser.add_argument('--bucket_name', type=str, required=True, help='Bucket to back up to')
    parser.add_argument('--prefix', type=str, default='', help='Optional prefix prepended to all S3 keys')
    parser.add_argument('--dynamodb_table_name', type=str, required=False, default='', help='Optional name of DynamoDB table to store metadata')
    parser.add_argument('--depth', type=int, required=True, help='How deep to descend into directories before creating archives. Depth of 0 means only backup-path will be compressed, depth of 1 means every file / folder within backup-path will get its own archive, etc.')
    parser.add_argument('--compression_algorithm', type=str, required=False, default='bzip2', choices=['lzma', 'bzip2', 'gzip'], help='Compression algorithm to use.')
    parser.add_argument('--keyfile', type=str, required=True, help='Path to keyfile. Will be generated if it does not exist')
    args = parser.parse_args()

    # progress is reported via logging.info; this is a no-op if logging has
    # already been configured elsewhere
    logging.basicConfig(level=logging.INFO)

    # first, get a list of all the paths that should be backed up individually
    paths_to_backup = get_subpaths(args.backup_path, args.depth)
    if not paths_to_backup:
        logging.error('path {} not found or empty'.format(args.backup_path))
        sys.exit(1)

    for path in paths_to_backup:
        # compress every path into an unencrypted tar file
        tarpath = compress(path, args.cache_path, args.compression_algorithm)

        # this is the md5 hash we will use to check if the local file is
        # different from the file on S3. we have to do this before encryption
        # since AES uses an initialization vector (IV), which changes the hash
        # of an encrypted file, even if you encrypt the same data twice
        md5_local_filesystem = get_md5(tarpath)

        # create the S3 key, which is the path the file will be stored in in S3
        # this is of format [prefix/]path/filename
        # so if backing up /home/myuser/mydir/something, with prefix home,
        # key will be home/home/myuser/mydir/something.tar.xz
        # s3_keys should not start with a / otherwise you have an empty prefix
        # it will work but its confusing
        s3_key = _build_s3_key(args.prefix, path, os.path.basename(tarpath))

        # this method checks if the file actually needs to be reuploaded
        if needs_reupload(s3_key, md5_local_filesystem, args.bucket_name, args.dynamodb_table_name):
            logging.info('File needs to be uploaded')
            # encrypt encrypts the file into a new file with ending .aes using
            # keyfile as the pass, then deletes the original tar file
            encrypt(tarpath, args.keyfile)
            upload(s3_key, tarpath + '.aes', args.bucket_name, md5_local_filesystem, args.dynamodb_table_name)
            os.remove(tarpath + '.aes')
        else:
            logging.info('MD5 on S3 is identical to local version. Skipping')
            # the archive was only needed for hashing; remove it so the cache
            # directory is left clean in both branches
            os.remove(tarpath)
# run the backup only when executed as a script, not when imported
if __name__ == "__main__":
    main()
