Created
October 12, 2012 09:34
-
-
Save elprup/3878387 to your computer and use it in GitHub Desktop.
Sync Scribe log files from local disk to Amazon S3 (with LZO compression)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
'''
s3lzocp
Version 0.2 2012-10-22
Sync log files from a local directory to S3, compressing each one with lzo.
Make sure lzop and s3cmd are installed.
usage:
s3lzocp.py /path/to/dir s3://bucket-name/path/to/dir
'''
import datetime
import logging
import os
import random
import re
import subprocess
import sys
import tempfile
# Raise the root logger to INFO so the per-file upload/skip messages emitted
# through the module-level logging.info() calls are not filtered out.
logging.getLogger().setLevel(logging.INFO)
def list(path):
    '''Yield the path of every file found under ``path``, recursively.

    Paths are built by joining the directory name reported by ``os.walk``
    with the file name, so they carry the same prefix as ``path``.

    Files are yielded in sorted order within each directory, and
    sub-directories are visited in sorted order, so the overall sequence is
    deterministic for a given tree.

    NOTE: this name shadows the builtin ``list`` inside this module; it is
    kept because ``main`` calls it by this name.
    '''
    for dir_name, subdir_names, filenames in os.walk(path):
        # Sorting in place steers the (top-down) walk: os.walk descends
        # into sub-directories in the order left in this list.
        subdir_names.sort()
        for name in sorted(filenames):
            yield os.path.join(dir_name, name)
def is_file_before_someday(file_path, someday=None):
    '''Return True if the date embedded in ``file_path`` is before ``someday``.

    The path must end in ``-<year>-<month>-<day>_<digits>``; the three
    numeric groups are read as a calendar date.

    file_path -- path (or bare name) of the log file to inspect.
    someday   -- ``datetime.date`` to compare against; defaults to today.

    Returns False when the name does not match the pattern or when the
    digits do not form a valid date (for example month 13).
    '''
    if someday is None:
        # set default comparison date to today
        someday = datetime.date.today()
    file_info = re.match(r'^.*-([0-9]+)-([0-9]+)-([0-9]+)_[0-9]+$', file_path)
    logging.debug('filecheck: %s, got result %s', file_path, file_info)
    if file_info is None:
        logging.debug('filecheck: fail to fetch date')
        return False
    try:
        year, month, day = (int(part) for part in file_info.groups())
        # Building the date is what can actually fail (out-of-range
        # month/day/year), so it has to live inside the try block.
        file_date = datetime.date(year, month, day)
    except (ValueError, OverflowError):
        logging.debug('filecheck: fail to initial year,month,day')
        return False
    return file_date < someday
def lzo(file_path, output_dir):
    '''Compress ``file_path`` with the ``lzop`` command into ``output_dir``.

    The output file gets a random 10-letter name inside ``output_dir``.

    file_path  -- path of the file to compress.
    output_dir -- directory that receives the compressed file.

    Returns the path of the output file. The path is returned even when
    lzop could not be started or exited with a non-zero status; such
    failures are logged, not raised.
    '''
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    output_filename = ''.join(random.choice(letters) for _ in range(10))
    output_file_path = os.path.join(output_dir, output_filename)
    # Argument list (no shell) so paths containing spaces or shell
    # metacharacters are passed to lzop verbatim.
    cmd = ['lzop', file_path, '-o', output_file_path, '--quiet']
    try:
        status = subprocess.call(cmd)
    except OSError as err:
        logging.error('lzo: could not run lzop: %s', err)
    else:
        if status != 0:
            logging.error('lzo: lzop exited with status %s for %s',
                          status, file_path)
    return output_file_path
class S3FilePath(object):
    '''Snapshot of the object paths listed under an S3 prefix.

    The listing is fetched once, when the instance is created, by running
    ``s3cmd ls <path> -r``. ``file_existed`` answers from that snapshot and
    never contacts S3 again.

    Attributes:
        path      -- the S3 prefix that was listed.
        file_list -- object paths parsed from the listing, in output order.
    '''

    def __init__(self, path, *arg, **kw):
        self.path = path
        self.file_list = self._get_file_list()
        # Set copy of file_list so each existence check is a hash lookup
        # instead of a scan over the whole listing.
        self._known_paths = frozenset(self.file_list)
        super(S3FilePath, self).__init__(*arg, **kw)

    @staticmethod
    def _parse_ls_line(line):
        '''Return the object path from one listing line, or None.

        Assumes the path is the fourth whitespace-separated column, as the
        original parsing did; lines with fewer columns yield None.
        '''
        # maxsplit=3 keeps the remainder of the line intact, so a path
        # containing spaces is not cut at its first space.
        fields = line.strip().split(None, 3)
        if len(fields) < 4:
            return None
        return fields[3]

    def _get_file_list(self):
        '''Run ``s3cmd ls <path> -r`` and return the listed object paths.

        Returns an empty list (and logs an error) if s3cmd cannot be run.
        '''
        logging.info('S3FilePath: reading file list, please wait...')
        cmd = ['s3cmd', 'ls', self.path, '-r']
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                    universal_newlines=True)
        except OSError as err:
            logging.error('S3FilePath: could not run s3cmd: %s', err)
            return []
        file_list = []
        try:
            for line in proc.stdout:
                file_path = self._parse_ls_line(line)
                if file_path is not None:
                    file_list.append(file_path)
        finally:
            proc.stdout.close()
            proc.wait()
        return file_list

    def file_existed(self, s3_path):
        '''Return True if ``s3_path`` appeared in the listing snapshot.'''
        return s3_path in self._known_paths
def file_existed_deprecated(s3_path):
    '''Return True if ``s3cmd ls <s3_path>`` lists exactly ``s3_path``.

    Deprecated (per its name): it runs one s3cmd process per call.

    Assumes the object path is the fourth whitespace-separated column of
    each output line; lines with fewer columns are ignored. Returns False
    (and logs an error) if s3cmd cannot be run.
    '''
    cmd = ['s3cmd', 'ls', s3_path]
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError as err:
        logging.error('file_existed_deprecated: could not run s3cmd: %s', err)
        return False
    try:
        for line in proc.stdout:
            # maxsplit=3 keeps paths that contain spaces in one piece.
            fields = line.strip().split(None, 3)
            if len(fields) == 4 and fields[3] == s3_path:
                return True
        return False
    finally:
        # Always release the pipe and reap the child, including on the
        # early return above.
        proc.stdout.close()
        proc.wait()
def s3_sync(file_path, s3_path):
    '''Upload ``file_path`` to ``s3_path`` by running ``s3cmd put``.

    The upload is unconditional: this function does not check whether the
    destination already exists.

    A failure to start s3cmd, or a non-zero exit status, is logged rather
    than raised. Returns None.
    '''
    # Argument list (no shell) so paths with spaces or shell metacharacters
    # reach s3cmd verbatim.
    cmd = ['s3cmd', 'put', file_path, s3_path]
    logging.info(' '.join(cmd))
    try:
        status = subprocess.call(cmd)
    except OSError as err:
        logging.error('s3_sync: could not run s3cmd: %s', err)
        return
    if status != 0:
        logging.error('s3_sync: s3cmd exited with status %s for %s',
                      status, s3_path)
def absolute_path(complete_path, root_path):
    '''Return ``complete_path`` expressed relative to ``root_path``.

    Despite the name, the result is a *relative* path: the part of
    ``complete_path`` below ``root_path``. Trailing or doubled slashes on
    either argument do not affect the result.

    complete_path -- path of a file located under ``root_path``.
    root_path     -- directory the result is relative to; an empty string
                     is treated as the current directory.
    '''
    logging.debug('abs_path: complete %s, root %s', complete_path, root_path)
    return os.path.relpath(complete_path, root_path)
def delete_file(filename):
    '''Remove ``filename``; a missing file is silently ignored.

    Uses ``os.remove`` rather than a shell ``rm -f`` so a name containing
    spaces or glob characters refers to exactly one file. A removal that
    fails while the path still exists is logged, not raised.
    '''
    try:
        os.remove(filename)
    except OSError as err:
        # Stay quiet for "already gone", as rm -f would.
        if os.path.lexists(filename):
            logging.warning('delete_file: could not remove %s: %s',
                            filename, err)
def main(argv):
    '''Compress and upload dated log files from a local tree to S3.

    argv -- ``[source_root, dest_root]``: the local directory to scan and
            the S3 prefix to upload into. Extra items are ignored.

    Each file under ``source_root`` whose name carries a date before today
    is compressed to a temporary file and uploaded as
    ``<dest_root>/<relative path>.lzo``, unless that destination already
    appears in the S3 listing taken at start-up.

    Raises SystemExit with a usage message when fewer than two arguments
    are given.
    '''
    # s3lzocp.py source_root dest_root
    if len(argv) < 2:
        raise SystemExit(
            'usage: s3lzocp.py /path/to/dir s3://bucket-name/path/to/dir')
    source_root, dest_root = argv[0], argv[1]
    temp_dir = tempfile.gettempdir()
    s3fp = S3FilePath(dest_root)
    for file_path in list(source_root):
        # Files dated today (or undated) are skipped -- presumably because
        # they may still be written to; confirm against the log producer.
        if not is_file_before_someday(file_path):
            continue
        dest_path = os.path.join(
            dest_root, absolute_path(file_path, source_root) + '.lzo')
        if s3fp.file_existed(dest_path):
            logging.info('s3_sync: existed file %s, skip', dest_path)
            continue
        lzo_temp_path = lzo(file_path, temp_dir)
        try:
            s3_sync(lzo_temp_path, dest_path)
        finally:
            # Never leave the compressed copy behind, even if the upload
            # raised.
            delete_file(lzo_temp_path)
# Script entry point: pass the command-line arguments (without the program
# name) to main().
if __name__ == '__main__':
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment