Created
August 6, 2014 15:29
-
-
Save danlmarmot/49544de4f2bc708e2770 to your computer and use it in GitHub Desktop.
Download files from S3 if they've been updated (i.e., their timestamps on S3 are newer than the ones recorded locally). Filenames are expected to stay the same, with their contents updated on a regular basis.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import os | |
import sys | |
import logging | |
import time | |
import json | |
from time import mktime | |
from boto.s3.connection import S3Connection | |
from boto.exception import S3DataError, S3ResponseError | |
# Dry run, useful for testing: when True, objects are checked but not downloaded
DRY_RUN = False

# S3 bucket and list of object paths (key names inside the bucket)
S3_BUCKET = "s3_bucket"
S3_FILES = ["path/to/objectfile1", "path/to/objectfile2"]

# Local download directory, plus the JSON status file and the log file
DOWNLOAD_DIR = "/tmp"
DOWNLOADS_STATUS_FILE = "/tmp/s3_downloads.json"
DOWNLOADS_LOG_FILE = "/tmp/s3_download_log.txt"

# AWS default creds
# todo: remove these after adding an IAM role with access to S3 buckets
# The standard AWS environment variables take precedence so that real
# credentials never have to be committed; the literals are only fallbacks.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "AKIAMYACCESSKEYTOAWS")
AWS_SECRET_KEY = os.environ.get(
    "AWS_SECRET_ACCESS_KEY", "MyOwnSecretSecretSecretKeyToMyAwsAccount")

# NOTE: DOWNLOADS_LOG_FILE is an absolute path, so os.path.join() returns it
# unchanged and DOWNLOAD_DIR has no effect here.
LOGFILE = os.path.join(DOWNLOAD_DIR, DOWNLOADS_LOG_FILE)

# Logging set to CRITICAL, ERROR, WARNING, INFO, DEBUG. Use capitalization
# DEBUG will also give AWS connection info from Boto, so INFO is generally best
LOGLEVEL = logging.INFO
def main():
    """Configure file logging, run the S3 download pass, and log completion."""
    log_settings = {
        "filename": LOGFILE,
        "level": LOGLEVEL,
        "format": "%(levelname)s, %(asctime)s: %(message)s",
        "datefmt": "%Y-%m-%d %H:%M:%S",
    }
    logging.basicConfig(**log_settings)

    download_from_s3()

    logging.info("Download process complete\n")
def download_from_s3():
    """Download every object listed in S3_FILES that needs refreshing.

    Connects to S3, opens S3_BUCKET, and calls download_obj_from_s3() for
    each entry of S3_FILES, passing the state previously recorded for that
    entry in the status file. Entries that were downloaded get their state
    updated, and the status file is rewritten once all entries are processed.

    Returns:
        A ``(ret_code, ret_data)`` tuple, which is always ``(0, {})``:
        per-file outcomes are only logged and recorded in the status file.

    Raises:
        SystemExit: if connecting to S3 or opening the bucket raises
            S3DataError or S3ResponseError.
    """
    ret_code = 0
    ret_data = dict()

    try:
        conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
        bucket = conn.get_bucket(S3_BUCKET)
    except (S3DataError, S3ResponseError) as e:
        logging.error(str(e))
        raise SystemExit("Error connecting to S3: " + str(e))

    try:
        download_stats = read_download_status()

        for s3_file in S3_FILES:
            result_code, result_data = download_obj_from_s3(
                bucket, s3_file, local_info=download_stats.get(s3_file, {}))

            # Update our local stats only if a download actually happened.
            # A dry run also reports code 0 but carries no "local_info",
            # so the key must be checked to avoid a KeyError.
            if result_code == 0 and "local_info" in result_data:
                download_stats[s3_file] = result_data["local_info"]

        # Write out updated stats file
        write_download_status(download_stats)
    finally:
        # Release the connection even if a download raised unexpectedly.
        conn.close()

    return ret_code, ret_data
def download_obj_from_s3(bucket, s3_obj, local_info):
    """Download one S3 object unless the local copy is already current.

    Args:
        bucket: Object exposing ``get_key(name)``; the caller in this file
            passes the bucket obtained from the S3 connection.
        s3_obj: Key name inside the bucket. It is also used as the path,
            relative to DOWNLOAD_DIR, of the local copy.
        local_info: Dict with the state recorded for this object. Its
            ``"timestamp"`` entry is read, and it is updated in place after
            a successful download.

    Returns:
        A ``(ret_code, ret_data)`` tuple. ``ret_code`` is 0 when the object
        was downloaded (``ret_data["local_info"]`` then holds the updated
        state) or when a dry run would have downloaded it (``ret_data`` is
        empty). ``ret_code`` is 1 when the object was skipped as up to date,
        or when it is missing or S3 reported an error, in which case
        ``ret_data["error"]`` holds the message.
    """
    ret_code = 0
    ret_data = dict()
    logging.info("Attempting download for " + s3_obj)

    try:
        file_key = bucket.get_key(s3_obj)
    except (S3DataError, S3ResponseError) as e:
        logging.error(str(e))
        ret_data["error"] = str(e)
        return 1, ret_data

    # get_key() yields None rather than raising when the key does not exist;
    # without this guard the attribute access below would fail.
    if file_key is None:
        message = "S3 object not found: " + s3_obj
        logging.error(message)
        ret_data["error"] = message
        return 1, ret_data

    local_path = os.path.join(DOWNLOAD_DIR, s3_obj)

    # str() guards against a status file that stored the timestamp as a number
    local_file_timestamp = str(local_info.get("timestamp", "0"))
    logging.info(" " + local_file_timestamp + " is local file timestamp")

    # S3 gives the date as e.g. 'Wed, 06 Aug 2014 22:29:00 GMT'. Convert it
    # to a string of integer seconds. mktime() interprets the parsed value
    # as local time, so this is only meaningful for equality comparison
    # against values produced the same way.
    modified = time.strptime(file_key.last_modified, '%a, %d %b %Y %H:%M:%S %Z')
    s3_obj_timestamp = str(int(mktime(modified)))
    logging.info(" " + s3_obj_timestamp + " is S3 object timestamp")

    # Return early if timestamps are the same and the file exists.
    # This is a simple check and not intended to be robust.
    if local_file_timestamp == s3_obj_timestamp and os.path.exists(local_path):
        logging.info(" File already present, skipping")
        return 1, ret_data

    if DRY_RUN:
        logging.info("Dry run: would've downloaded file " + s3_obj)
        return ret_code, ret_data

    try:
        logging.info("Downloading new file " + s3_obj)
        # Key names may contain '/', so make sure the target directory exists
        parent_dir = os.path.dirname(local_path)
        if parent_dir and not os.path.isdir(parent_dir):
            os.makedirs(parent_dir)
        file_key.get_contents_to_filename(local_path)
        local_info["timestamp"] = s3_obj_timestamp
        ret_data["local_info"] = local_info
    except (S3DataError, S3ResponseError) as e:
        logging.error(str(e))
        ret_code = 1
        ret_data["error"] = str(e)

    return ret_code, ret_data
def read_download_status():
    """Load the per-object download state from DOWNLOADS_STATUS_FILE.

    Returns:
        The dict stored in the status file, or an empty dict when the file
        does not exist, cannot be read, is not valid JSON, or does not
        contain a JSON object.
    """
    if not os.path.exists(DOWNLOADS_STATUS_FILE):
        return {}

    try:
        # "with" closes the handle; open() works on both Python 2 and 3
        with open(DOWNLOADS_STATUS_FILE) as f:
            download_info = json.load(f)
    except (IOError, OSError, ValueError) as e:
        # An unreadable or corrupt status file means "nothing downloaded yet"
        logging.warning("Could not read %s: %s", DOWNLOADS_STATUS_FILE, e)
        return {}

    # Callers look entries up with .get(), so anything but a dict is unusable
    if not isinstance(download_info, dict):
        return {}

    return download_info
def write_download_status(file_info):
    """Save the per-object download state to DOWNLOADS_STATUS_FILE as JSON.

    Args:
        file_info: JSON-serializable dict of download state to persist.

    Raises:
        TypeError: if ``file_info`` contains a value json cannot serialize.
            The existing status file is left untouched in that case.
    """
    # Serialize before opening: mode 'w' truncates the file immediately, so
    # a serialization error after opening would wipe the previous state.
    serialized = json.dumps(file_info, indent=2, sort_keys=True)
    with open(DOWNLOADS_STATUS_FILE, 'w') as f:
        f.write(serialized)
# Run the download pass only when this file is executed as a script
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment