Skip to content

Instantly share code, notes, and snippets.

@elprup
Created October 12, 2012 09:34
Show Gist options
  • Save elprup/3878387 to your computer and use it in GitHub Desktop.
Save elprup/3878387 to your computer and use it in GitHub Desktop.
Sync scribe log files from local disk to Amazon S3 (with LZO compression)
#!/usr/bin/python
'''
s3lzocp
Version 0.2 2012-10-22
Sync scribe logs from a local directory to S3, compressing each file with lzop.
Make sure lzop and s3cmd are installed.
usage:
s3lzocp.py /path/to/dir s3://bucket-name/path/to/dir
'''
import datetime
import errno
import logging
import os
import random
import re
import subprocess
import sys
# Lower the root logger's threshold to INFO so the logging.info() progress
# messages emitted by this script are not filtered out.
logging.getLogger().setLevel(logging.INFO)
def list(path):
    ''' yield the full path of every file found below path

    Directories are visited in os.walk order; inside each directory the
    file names are produced in sorted order.
    '''
    for dir_name, _subdirs, filenames in os.walk(path):
        ordered_paths = (os.path.join(dir_name, name) for name in sorted(filenames))
        for full_path in ordered_paths:
            yield full_path
def is_file_before_someday(file_path, someday=None):
    ''' check whether the date in a scribe log file name is before someday

    file_path must end with "-<year>-<month>-<day>_<sequence>" (digits only).
    someday is a datetime.date; it defaults to today.

    Returns True only when the date embedded in the name is strictly earlier
    than someday. Returns False when the name carries no date or when the
    digits do not form a valid calendar date.
    '''
    if someday is None:
        # default to today, so the file currently being written is skipped
        someday = datetime.date.today()
    file_info = re.match(r'^.*-([0-9]+)-([0-9]+)-([0-9]+)_[0-9]+$', file_path)
    logging.debug('filecheck: %s, got result %s', file_path, file_info)
    if file_info is None:
        logging.debug('filecheck: fail to fetch date')
        return False
    try:
        year, month, day = (int(part) for part in file_info.groups())
        # datetime.date rejects values such as month 13 or year 0; that must
        # not abort the whole sync run, so it is handled like "no date found"
        file_date = datetime.date(year, month, day)
    except (ValueError, OverflowError):
        logging.debug('filecheck: fail to initial year,month,day')
        return False
    return file_date < someday
def lzo(file_path, output_dir):
    '''call lzop command to compress file_path into output_dir

    The compressed copy gets a random 10-letter file name inside output_dir.

    Returns the path of the output file. The path is returned even when lzop
    could not be started or exited with an error (the failure is logged), so
    callers that need the file should check that it exists.
    '''
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    output_filename = ''.join(random.choice(letters) for _ in range(10))
    output_file_path = os.path.join(output_dir, output_filename)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    cmd = ['lzop', file_path, '-o', output_file_path, '--quiet']
    try:
        status = subprocess.call(cmd)
    except OSError as exc:
        # lzop is not installed or not executable
        logging.error('lzo: cannot run lzop: %s', exc)
    else:
        if status != 0:
            logging.error('lzo: lzop exited with status %s for %s', status, file_path)
    return output_file_path
class S3FilePath(object):
    ''' snapshot of the objects stored below an S3 path

    The listing is fetched once, with "s3cmd ls <path> -r", when the object
    is created. file_existed answers from that snapshot and never contacts
    S3 again.
    '''

    def __init__(self, path, *arg, **kw):
        self.path = path
        # listing in the order s3cmd printed it
        self.file_list = self._get_file_list()
        # set copy of the snapshot: file_existed is called once per local
        # file, and scanning the list each time would be quadratic
        self._file_set = set(self.file_list)
        super(S3FilePath, self).__init__(*arg, **kw)

    @staticmethod
    def _parse_listing(lines):
        ''' extract the object paths from "s3cmd ls" output lines

        An object line has four columns: date, time, size, path. Lines with
        fewer columns (blank lines, "DIR" entries) are skipped. The path
        column is taken as the rest of the line, so a key containing spaces
        is kept whole.
        '''
        paths = []
        for line in lines:
            fields = line.strip().split(None, 3)
            if len(fields) < 4:
                continue
            paths.append(fields[3])
        return paths

    def _get_file_list(self):
        ''' run "s3cmd ls -r" on self.path and return the listed paths

        Returns an empty list when s3cmd cannot be started.
        '''
        logging.info('S3FilePath: reading file list, please wait...')
        # An argument list (no shell) keeps the path from being re-parsed.
        cmd = ['s3cmd', 'ls', self.path, '-r']
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                    universal_newlines=True)
        except OSError as exc:
            logging.error('S3FilePath: cannot run s3cmd: %s', exc)
            return []
        # communicate() reads all output, closes the pipe and reaps the child
        output, _ = proc.communicate()
        return self._parse_listing(output.splitlines())

    def file_existed(self, s3_path):
        ''' return True if s3_path was present in the listing snapshot '''
        return s3_path in self._file_set
def file_existed_deprecated(s3_path):
    ''' check if file existed

    Runs "s3cmd ls <s3_path>" and returns True when one of the listed
    objects has exactly the path s3_path. Returns False otherwise, including
    when s3cmd cannot be started.
    '''
    # An argument list (no shell) keeps the path from being re-parsed.
    cmd = ['s3cmd', 'ls', s3_path]
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError as exc:
        logging.error('file_existed: cannot run s3cmd: %s', exc)
        return False
    # communicate() reads all output, closes the pipe and reaps the child
    output, _ = proc.communicate()
    for line in output.splitlines():
        # object lines have four columns (date, time, size, path); the path
        # is the rest of the line so keys containing spaces are kept whole
        fields = line.strip().split(None, 3)
        if len(fields) == 4 and fields[3] == s3_path:
            return True
    return False
def s3_sync(file_path, s3_path):
    ''' upload file_path to s3_path with "s3cmd put"

    This function does not check whether the object already exists; the
    caller is expected to do that. A failure to start s3cmd, or a non-zero
    exit status, is logged and otherwise ignored.
    '''
    logging.info('s3cmd put %s %s', file_path, s3_path)
    try:
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        status = subprocess.call(['s3cmd', 'put', file_path, s3_path])
    except OSError as exc:
        logging.error('s3_sync: cannot run s3cmd: %s', exc)
        return
    if status != 0:
        logging.error('s3_sync: s3cmd exited with status %s for %s', status, s3_path)
def absolute_path(complete_path, root_path):
    ''' return the part of complete_path that lies below root_path

    complete_path is expected to start with root_path. One trailing "/" on
    root_path is ignored. An empty root_path returns complete_path unchanged.
    No check is made that complete_path really starts with root_path; the
    result is simply complete_path with len(root_path) + 1 leading
    characters removed.
    '''
    if not root_path:
        # nothing to strip; indexing root_path[-1] would raise IndexError
        return complete_path
    if root_path.endswith('/'):
        # delete suffix /
        root_path = root_path[:-1]
    logging.debug('abs_path: complete %s, root %s', complete_path, root_path)
    # + 1 also skips the "/" that separates the root from the rest
    return complete_path[len(root_path) + 1:]
def delete_file(filename):
    ''' remove filename, ignoring the case where it does not exist

    Any other failure (for example a permission error) is logged as a
    warning and not raised.
    '''
    try:
        # os.remove takes the name literally, so names with spaces or shell
        # metacharacters are handled correctly
        os.remove(filename)
    except OSError as exc:
        if exc.errno != errno.ENOENT:
            logging.warning('delete_file: cannot remove %s: %s', filename, exc)
def main(argv):
    ''' compress and upload every dated scribe log found below source_root

    argv is [source_root, dest_root]: a local directory and an s3:// prefix.
    Exits with a usage message when fewer than two arguments are given.

    Each local file whose name carries a date before today is compressed
    with lzop into /tmp and uploaded to the same relative path below
    dest_root with ".lzo" appended. Files already present in the S3 listing
    are skipped.
    '''
    # s3lzocp.py source_root dest_root
    if len(argv) < 2:
        sys.exit('usage: s3lzocp.py /path/to/dir s3://bucket-name/path/to/dir')
    source_root, dest_root = argv[0], argv[1]
    temp_dir = '/tmp'
    s3fp = S3FilePath(dest_root)
    for file_path in list(source_root):
        if not is_file_before_someday(file_path):
            continue
        dest_path = os.path.join(dest_root, absolute_path(file_path, source_root) + '.lzo')
        if s3fp.file_existed(dest_path):
            logging.info('s3_sync: existed file %s, skip', dest_path)
            continue
        lzo_temp_path = lzo(file_path, temp_dir)
        try:
            if not os.path.exists(lzo_temp_path):
                # compression failed; there is nothing to upload
                logging.error('s3_sync: no compressed output for %s, skip', file_path)
                continue
            s3_sync(lzo_temp_path, dest_path)
        finally:
            # always clean up the temporary file, even if the upload raised
            delete_file(lzo_temp_path)
# Run the sync only when executed as a script; the program name is dropped so
# main() receives just [source_root, dest_root].
if __name__ == '__main__':
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment