@ankona
Last active August 29, 2015 14:23
List all the files in an S3 bucket
AWS_ACCESS_KEY_ID = '' # todo: insert aws key.
AWS_ACCESS_KEY_SECRET = '' # todo: insert aws secret.
import os
import boto
import boto.s3
import os.path
import sys
import getopt
import logging
from datetime import datetime
s3_search_prefix = ""
working_dir_path = ""
target_bucket = ""
process_start_time = datetime.now()
execute_timestamp = process_start_time.strftime("%Y%m%d.%I%M%p")
if len(sys.argv) == 1:
    print 'You must supply arguments. Use -h for help'
    quit()
def print_help_text():
    print 'usage: file_list_s3.py -p <s3prefix> -w <workdir> -b <bucket>'
    print "sample: python file_list_s3.py -w /home/ec2-user/s3_puller/ -p 05_21_14_2013_CAIA_Level_1_September -b readyforkapx-ondemand"
try:
    opts, args = getopt.getopt(sys.argv[1:], "hp:w:b:", ["help", "s3prefix=", "workdir=", "bucket="])
except getopt.GetoptError:
    print_help_text()
    sys.exit(2)
for opt, arg in opts:
    if opt in ("-h", "--help"):
        print_help_text()
        sys.exit()
    elif opt in ("-p", "--s3prefix"):
        s3_search_prefix = arg
    elif opt in ("-w", "--workdir"):
        working_dir_path = arg
    elif opt in ("-b", "--bucket"):
        target_bucket = arg
if s3_search_prefix == "":
    print 'YOU DID NOT SUPPLY A PREFIX. ALL ITEMS IN THE BUCKET WILL BE INSPECTED! THIS MAY TAKE A LONG TIME!'
if working_dir_path == "" or target_bucket == "":
    print 'you must supply both the working directory path and the target bucket.'
    print_help_text()
    quit()
# create the working directory and its logs subdirectory if they don't already exist.
if not os.path.exists(working_dir_path):
    os.makedirs(working_dir_path)
if not os.path.exists(working_dir_path + "/logs"):
    os.makedirs(working_dir_path + "/logs")
log_file_name = execute_timestamp + ".file_list.txt"
err_log_file_name = execute_timestamp + ".file_list.ERRORS.txt"
#logging.basicConfig(filename=working_dir_path + "/logs/" + log_file_name)
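# log DEBUG and above to stdout and to the run log file; WARNING and above also lands in a separate error log.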
logger = logging.getLogger("s3_importer")
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
sh = logging.StreamHandler(sys.stdout)
sh.setLevel(logging.DEBUG)
sh.setFormatter(formatter)
logger.addHandler(sh)
fh = logging.FileHandler(working_dir_path + "/logs/" + log_file_name)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)
simple_error_formatter = logging.Formatter('%(message)s')
fhe = logging.FileHandler(working_dir_path + "/logs/" + err_log_file_name)
fhe.setLevel(logging.WARNING)
fhe.setFormatter(simple_error_formatter)
logger.addHandler(fhe)
logger.setLevel(logging.DEBUG)
logger.info("retrieving data from target s3 bucket: " + target_bucket)
logger.info("s3 searches limited by prefix: " + s3_search_prefix)
logger.info("working directory: " + working_dir_path)
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_KEY_SECRET)
bucket = conn.get_bucket(target_bucket)
def list_files_from_s3(target_write_path):
    # bucket.list() returns an iterator that transparently pages through every matching key.
    if s3_search_prefix:
        key_list = bucket.list(prefix=s3_search_prefix)
    else:
        key_list = bucket.list()
    item_count = 0
    if not os.path.exists(target_write_path):
        os.makedirs(target_write_path)
    for k in key_list:
        logger.info(k.name)
        item_count += 1
    logger.info("{0} files found.".format(item_count))
try:
    list_files_from_s3(working_dir_path)
except Exception, ex:
    logger.error(ex)
    exit()
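
For comparison, the same listing against the newer boto3 API: a minimal sketch, assuming boto3 is installed and credentials come from the standard lookup chain (environment variables, ~/.aws/credentials, or an instance role) rather than keys pasted into the script. The function name list_bucket_keys and the sample call at the bottom are illustrative, not part of the original gist.

import boto3

def list_bucket_keys(bucket_name, prefix=""):
    # list_objects_v2 returns at most 1000 keys per call; the paginator
    # issues follow-up requests until the listing is exhausted.
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    count = 0
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in page.get("Contents", []):  # "Contents" is absent on empty pages
            print(obj["Key"])
            count += 1
    print("{0} files found.".format(count))

# sample call mirroring the gist's usage example (bucket and prefix are illustrative):
list_bucket_keys("readyforkapx-ondemand", "05_21_14_2013_CAIA_Level_1_September")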