Skip to content

Instantly share code, notes, and snippets.

@danlmarmot
Created August 6, 2014 15:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danlmarmot/49544de4f2bc708e2770 to your computer and use it in GitHub Desktop.
Save danlmarmot/49544de4f2bc708e2770 to your computer and use it in GitHub Desktop.
Download files from S3 if they've been updated (i.e., they have newer timestamps on S3 than what's saved locally). Filenames are expected to stay the same and to be updated on a regular basis.
#!/usr/bin/python
import os
import sys
import logging
import time
import json
from time import mktime
from boto.s3.connection import S3Connection
from boto.exception import S3DataError, S3ResponseError
# Dry run, useful for testing
DRY_RUN = False
# S3 bucket and list of object paths
S3_BUCKET = "s3_bucket"
S3_FILES = ["path/to/objectfile1", "path/to/objectfile2"]
# Local download directory
DOWNLOAD_DIR = "/tmp"
# JSON file recording the last-downloaded timestamp per S3 object
DOWNLOADS_STATUS_FILE = "/tmp/s3_downloads.json"
DOWNLOADS_LOG_FILE = "/tmp/s3_download_log.txt"
# AWS default creds
# todo: remove these after adding an IAM role with access to S3 buckets
# SECURITY: credentials hard-coded in source -- rotate these and move them to
# an IAM role or environment variables before deploying.
AWS_ACCESS_KEY = "AKIAMYACCESSKEYTOAWS"
AWS_SECRET_KEY = "MyOwnSecretSecretSecretKeyToMyAwsAccount"
# NOTE(review): DOWNLOADS_LOG_FILE is an absolute path, so os.path.join()
# discards DOWNLOAD_DIR here and LOGFILE == DOWNLOADS_LOG_FILE -- confirm
# whether that is intended.
LOGFILE = os.path.join(DOWNLOAD_DIR, DOWNLOADS_LOG_FILE)
# Logging set to CRITICAL, ERROR, WARNING, INFO, DEBUG. Use capitalization
# DEBUG will also give AWS connection info from Boto, so INFO is generally best
LOGLEVEL = logging.INFO
def main():
    """Configure file logging, then sync the configured S3 objects locally."""
    logging.basicConfig(
        format='%(levelname)s, %(asctime)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        filename=LOGFILE,
        level=LOGLEVEL,
    )
    download_from_s3()
    logging.info("Download process complete\n")
def download_from_s3():
    """Download each object in S3_FILES whose S3 timestamp differs from the
    locally recorded one, then persist the updated timestamps.

    Returns:
        (ret_code, ret_data): ret_code is 0 when every file either downloaded
        cleanly or was already current, 1 when any per-file error occurred.
        ret_data maps each failed S3 path to its error message.

    Raises:
        SystemExit: if the initial S3 connection or bucket lookup fails.
    """
    ret_code = 0
    ret_data = dict()
    try:
        conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
        bucket = conn.get_bucket(S3_BUCKET)
    except (S3DataError, S3ResponseError) as e:
        logging.info(str(e))
        raise SystemExit("Error connecting to S3: " + str(e))
    try:
        download_stats = read_download_status()
        for s3_file in S3_FILES:
            result_code, result_data = download_obj_from_s3(
                bucket, s3_file, local_info=download_stats.get(s3_file, {}))
            if result_code == 0:
                # Successful download: remember the new S3 timestamp.
                # Guarded with "in" because a DRY_RUN success returns
                # result_code 0 with no "local_info" entry (previously a
                # KeyError here).
                if "local_info" in result_data:
                    download_stats[s3_file] = result_data["local_info"]
            elif "error" in result_data:
                # A real failure (result_code 1 with no "error" just means
                # "already up to date"): surface it to the caller.
                ret_code = 1
                ret_data[s3_file] = result_data["error"]
        # Write out updated stats file so the next run can skip fresh files.
        write_download_status(download_stats)
    finally:
        # Previously conn.close() was skipped whenever anything above raised.
        conn.close()
    return ret_code, ret_data
def download_obj_from_s3(bucket, s3_obj, local_info):
    """Download one S3 object into DOWNLOAD_DIR if its timestamp changed.

    Args:
        bucket: an open boto Bucket.
        s3_obj: key path of the object within the bucket.
        local_info: dict, optionally holding the "timestamp" (string epoch
            seconds) recorded by a previous successful download.

    Returns:
        (ret_code, ret_data): ret_code 0 on success (ret_data carries the
        updated "local_info", except in DRY_RUN mode); ret_code 1 both on
        error (ret_data carries "error") and when the file is already
        current (ret_data empty) -- callers distinguish via the "error" key.
    """
    ret_code = 0
    ret_data = dict()
    logging.info("Attempting download for " + s3_obj)
    try:
        file_key = bucket.get_key(s3_obj)
    except (S3DataError, S3ResponseError) as e:
        logging.info(str(e))
        ret_code = 1
        ret_data["error"] = str(e)
        return ret_code, ret_data
    # get_key() returns None (it does not raise) when the object is missing;
    # without this guard the last_modified access below crashed.
    if file_key is None:
        logging.info(" Object not found on S3: " + s3_obj)
        ret_code = 1
        ret_data["error"] = "Object not found: " + s3_obj
        return ret_code, ret_data
    local_file_timestamp = local_info.get("timestamp", "0")
    logging.info(" " + local_file_timestamp + " is local file timestamp")
    # S3 gives the date in a funky format. Convert it to a string epoch time.
    # NOTE(review): last_modified is GMT but mktime() interprets the struct as
    # local time. The value is only compared for equality with our own stored
    # value, so it stays self-consistent, but it is not a true epoch --
    # confirm before using it as one (calendar.timegm would be correct).
    modified = time.strptime(file_key.last_modified, '%a, %d %b %Y %H:%M:%S %Z')
    s3_obj_timestamp = str(int(mktime(modified)))
    logging.info(" " + s3_obj_timestamp + " is S3 object timestamp")
    local_path = os.path.join(DOWNLOAD_DIR, s3_obj)
    # Return early if the timestamps match and the file exists locally.
    # This is a simple check and not intended to be robust.
    if local_file_timestamp == s3_obj_timestamp and os.path.exists(local_path):
        logging.info(" File already present, skipping")
        ret_code = 1
        return ret_code, ret_data
    try:
        if not DRY_RUN:
            logging.info("Downloading new file " + s3_obj)
            # s3_obj contains path separators, so ensure the target
            # subdirectory under DOWNLOAD_DIR exists before writing
            # (previously the download failed on a missing directory).
            target_dir = os.path.dirname(local_path)
            if target_dir and not os.path.isdir(target_dir):
                os.makedirs(target_dir)
            file_key.get_contents_to_filename(local_path)
            local_info["timestamp"] = s3_obj_timestamp
            ret_data["local_info"] = local_info
        else:
            logging.info("Dry run: would've downloaded file " + s3_obj)
    except (S3DataError, S3ResponseError) as e:
        logging.info(str(e))
        ret_code = 1
        ret_data["error"] = str(e)
    return ret_code, ret_data
def read_download_status():
    """Load per-file download timestamps from DOWNLOADS_STATUS_FILE.

    Returns:
        dict mapping S3 object path -> {"timestamp": ...}; an empty dict when
        the file is missing, unreadable, or not valid JSON (e.g. first run).
    """
    if not os.path.exists(DOWNLOADS_STATUS_FILE):
        return {}
    try:
        # file() was the Python-2-only builtin and leaked the handle;
        # a context-managed open() fixes both.
        with open(DOWNLOADS_STATUS_FILE) as f:
            return json.load(f)
    except (IOError, OSError, ValueError):
        # Narrowed from a bare except: ValueError covers JSON decode errors.
        # A corrupt status file just means "no history" -- every object gets
        # re-checked rather than crashing the sync.
        return {}
def write_download_status(file_info):
    """Persist the per-file download timestamps as pretty-printed JSON."""
    serialized = json.dumps(file_info, indent=2, sort_keys=True)
    with open(DOWNLOADS_STATUS_FILE, 'w') as f:
        f.write(serialized)
# Script entry point: run the sync only when executed directly, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment