Skip to content

Instantly share code, notes, and snippets.

@danlmarmot
Created August 6, 2014 15:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danlmarmot/49544de4f2bc708e2770 to your computer and use it in GitHub Desktop.
Save danlmarmot/49544de4f2bc708e2770 to your computer and use it in GitHub Desktop.
Download files from S3 if they've been updated (i.e., they have newer timestamps on S3 than what's saved locally). Filenames are expected to stay the same and to be updated on a regular basis.
#!/usr/bin/python
import os
import sys
import logging
import time
import json
from time import mktime
from boto.s3.connection import S3Connection
from boto.exception import S3DataError, S3ResponseError
# Dry run, useful for testing
DRY_RUN = False
# S3 bucket and list of object paths
S3_BUCKET = "s3_bucket"
S3_FILES = ["path/to/objectfile1", "path/to/objectfile2"]
# Local download directory
DOWNLOAD_DIR = "/tmp"
# JSON file recording the last-downloaded timestamp per S3 object
DOWNLOADS_STATUS_FILE = "/tmp/s3_downloads.json"
DOWNLOADS_LOG_FILE = "/tmp/s3_download_log.txt"
# AWS default creds
# todo: remove these after adding an IAM role with access to S3 buckets
# SECURITY: credentials hard-coded in source -- rotate these and move them to
# an IAM role or environment variables before deploying.
AWS_ACCESS_KEY = "AKIAMYACCESSKEYTOAWS"
AWS_SECRET_KEY = "MyOwnSecretSecretSecretKeyToMyAwsAccount"
# NOTE(review): DOWNLOADS_LOG_FILE is an absolute path, so os.path.join()
# discards DOWNLOAD_DIR here and LOGFILE == DOWNLOADS_LOG_FILE -- confirm
# whether that is intended.
LOGFILE = os.path.join(DOWNLOAD_DIR, DOWNLOADS_LOG_FILE)
# Logging set to CRITICAL, ERROR, WARNING, INFO, DEBUG. Use capitalization
# DEBUG will also give AWS connection info from Boto, so INFO is generally best
LOGLEVEL = logging.INFO
def main():
    """Configure file logging, then sync the configured S3 objects locally."""
    logging.basicConfig(
        format='%(levelname)s, %(asctime)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        filename=LOGFILE,
        level=LOGLEVEL,
    )
    download_from_s3()
    logging.info("Download process complete\n")
def download_from_s3():
    """Download each object in S3_FILES whose S3 timestamp differs from the
    locally recorded one, then persist the updated timestamps.

    Returns:
        (ret_code, ret_data): ret_code is 0 when every file either downloaded
        cleanly or was already current, 1 when any per-file error occurred.
        ret_data maps each failed S3 path to its error message.

    Raises:
        SystemExit: if the initial S3 connection or bucket lookup fails.
    """
    ret_code = 0
    ret_data = dict()
    try:
        conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
        bucket = conn.get_bucket(S3_BUCKET)
    except (S3DataError, S3ResponseError) as e:
        logging.info(str(e))
        raise SystemExit("Error connecting to S3: " + str(e))
    try:
        download_stats = read_download_status()
        for s3_file in S3_FILES:
            result_code, result_data = download_obj_from_s3(
                bucket, s3_file, local_info=download_stats.get(s3_file, {}))
            if result_code == 0:
                # Successful download: remember the new S3 timestamp.
                # Guarded with "in" because a DRY_RUN success returns
                # result_code 0 with no "local_info" entry (previously a
                # KeyError here).
                if "local_info" in result_data:
                    download_stats[s3_file] = result_data["local_info"]
            elif "error" in result_data:
                # A real failure (result_code 1 with no "error" just means
                # "already up to date"): surface it to the caller.
                ret_code = 1
                ret_data[s3_file] = result_data["error"]
        # Write out updated stats file so the next run can skip fresh files.
        write_download_status(download_stats)
    finally:
        # Previously conn.close() was skipped whenever anything above raised.
        conn.close()
    return ret_code, ret_data
def download_obj_from_s3(bucket, s3_obj, local_info):
    """Download one S3 object into DOWNLOAD_DIR if its timestamp changed.

    Args:
        bucket: an open boto Bucket.
        s3_obj: key path of the object within the bucket.
        local_info: dict, optionally holding the "timestamp" (string epoch
            seconds) recorded by a previous successful download.

    Returns:
        (ret_code, ret_data): ret_code 0 on success (ret_data carries the
        updated "local_info", except in DRY_RUN mode); ret_code 1 both on
        error (ret_data carries "error") and when the file is already
        current (ret_data empty) -- callers distinguish via the "error" key.
    """
    ret_code = 0
    ret_data = dict()
    logging.info("Attempting download for " + s3_obj)
    try:
        file_key = bucket.get_key(s3_obj)
    except (S3DataError, S3ResponseError) as e:
        logging.info(str(e))
        ret_code = 1
        ret_data["error"] = str(e)
        return ret_code, ret_data
    # get_key() returns None (it does not raise) when the object is missing;
    # without this guard the last_modified access below crashed.
    if file_key is None:
        logging.info(" Object not found on S3: " + s3_obj)
        ret_code = 1
        ret_data["error"] = "Object not found: " + s3_obj
        return ret_code, ret_data
    local_file_timestamp = local_info.get("timestamp", "0")
    logging.info(" " + local_file_timestamp + " is local file timestamp")
    # S3 gives the date in a funky format. Convert it to a string epoch time.
    # NOTE(review): last_modified is GMT but mktime() interprets the struct as
    # local time. The value is only compared for equality with our own stored
    # value, so it stays self-consistent, but it is not a true epoch --
    # confirm before using it as one (calendar.timegm would be correct).
    modified = time.strptime(file_key.last_modified, '%a, %d %b %Y %H:%M:%S %Z')
    s3_obj_timestamp = str(int(mktime(modified)))
    logging.info(" " + s3_obj_timestamp + " is S3 object timestamp")
    local_path = os.path.join(DOWNLOAD_DIR, s3_obj)
    # Return early if the timestamps match and the file exists locally.
    # This is a simple check and not intended to be robust.
    if local_file_timestamp == s3_obj_timestamp and os.path.exists(local_path):
        logging.info(" File already present, skipping")
        ret_code = 1
        return ret_code, ret_data
    try:
        if not DRY_RUN:
            logging.info("Downloading new file " + s3_obj)
            # s3_obj contains path separators, so ensure the target
            # subdirectory under DOWNLOAD_DIR exists before writing
            # (previously the download failed on a missing directory).
            target_dir = os.path.dirname(local_path)
            if target_dir and not os.path.isdir(target_dir):
                os.makedirs(target_dir)
            file_key.get_contents_to_filename(local_path)
            local_info["timestamp"] = s3_obj_timestamp
            ret_data["local_info"] = local_info
        else:
            logging.info("Dry run: would've downloaded file " + s3_obj)
    except (S3DataError, S3ResponseError) as e:
        logging.info(str(e))
        ret_code = 1
        ret_data["error"] = str(e)
    return ret_code, ret_data
def read_download_status():
    """Load per-file download timestamps from DOWNLOADS_STATUS_FILE.

    Returns:
        dict mapping S3 object path -> {"timestamp": ...}; an empty dict when
        the file is missing, unreadable, or not valid JSON (e.g. first run).
    """
    if not os.path.exists(DOWNLOADS_STATUS_FILE):
        return {}
    try:
        # file() was the Python-2-only builtin and leaked the handle;
        # a context-managed open() fixes both.
        with open(DOWNLOADS_STATUS_FILE) as f:
            return json.load(f)
    except (IOError, OSError, ValueError):
        # Narrowed from a bare except: ValueError covers JSON decode errors.
        # A corrupt status file just means "no history" -- every object gets
        # re-checked rather than crashing the sync.
        return {}
def write_download_status(file_info):
    """Persist the per-file download timestamps as pretty-printed JSON."""
    serialized = json.dumps(file_info, indent=2, sort_keys=True)
    with open(DOWNLOADS_STATUS_FILE, 'w') as f:
        f.write(serialized)
# Script entry point: run the sync only when executed directly, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment