@eykd, created October 26, 2010
A simple script for caching packages on S3 and building simple HTML indices.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""pycache -- cache a python package from PyPI on S3.
A simple script to collect a cache of packages locally and sync them up to an S3 bucket, using directories as namespaces so that different projects can have different dependencies.
This is just about the simplest thing that could possibly work.
"""
import os
import argparse
import datetime
import time
import mimetypes
import itertools
import tempfile  # replaces os.tmpnam(), whose RuntimeWarning the old blanket warnings filter existed to hide
from paver.easy import path
from setuptools.package_index import PackageIndex
import boto
import boto.exception
import boto.s3.key
__cache__ = path("~/.pycache").expanduser().abspath()
if not __cache__.exists():
    __cache__.makedirs()
index = PackageIndex(index_url="http://pypi.it.uwosh.edu/", search_path=[])
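# PackageIndex.download() accepts a requirement string (e.g. "SomePackage>=1.0")
# or a direct URL, fetches the best match from index_url into the given
# directory, and returns the local path of the downloaded file, or None if
# nothing matched.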
html = """<html>
<head><title>Index - {project}</title></head>
<body>
<h1>{project}</h1>
{body}
</body>
</html>
"""
def main(package=None, project=None, sync=False):
    if project is not None:
        proj_p = __cache__ / project
    else:
        proj_p = __cache__
    if not proj_p.exists():
        proj_p.makedirs()
    if package is not None:
        # mkdtemp() both picks a name and creates the directory atomically,
        # avoiding the race condition in the original os.tmpnam() approach.
        tmp = path(tempfile.mkdtemp())
        dl = index.download(package, tmp)
        if dl is not None:
            fn = path(dl)
            fn.copy(proj_p / fn.name)
        for fn in tmp.listdir():
            fn.remove()
        tmp.removedirs()
    if sync:
        buildIndices()
        syncer = SyncS3(__cache__)
        syncer.sync_s3()
def buildIndices():
    for proj_p in itertools.chain((__cache__,), __cache__.walkdirs()):
        links = ('<li><a href="{project}/{file}">{name}</a></li>'.format(
                     project=proj_p.partition(__cache__)[-1],
                     file=fn.isdir() and (fn.name + '/index.html') or fn.name,
                     name=fn.name)
                 for fn in proj_p.listdir() if fn.name != 'index.html')
        with open(proj_p / 'index.html', 'w') as fo:
            fo.write(html.format(body="<ul>%s</ul>" % ''.join(links),
                                 project=proj_p.name))
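# For illustration: a cache containing ~/.pycache/myproject/SomePackage-1.0.tar.gz
# (hypothetical names) yields a ~/.pycache/myproject/index.html whose body is:
#   <ul><li><a href="/myproject/SomePackage-1.0.tar.gz">SomePackage-1.0.tar.gz</a></li></ul>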
class SyncS3(object):
    AWS_ACCESS_KEY_ID = '****************'
    AWS_SECRET_ACCESS_KEY = '*****************'
    AWS_BUCKET_NAME = 'pypi.yourdomain.org'
    FILTER_LIST = []

    def __init__(self, directory, prefix=None, do_force=False, verbosity=0):
        self.DIRECTORY = directory
        self.prefix = prefix
        self.do_force = do_force
        self.verbosity = verbosity
        self.upload_count = 0
        self.skip_count = 0
    def sync_s3(self):
        """
        Walks the cache directory and syncs files to S3
        """
        bucket, key = self.open_s3()
        os.path.walk(self.DIRECTORY, self.upload_s3,
                     (bucket, key, self.AWS_BUCKET_NAME, self.DIRECTORY))
    def open_s3(self):
        """
        Opens connection to S3 returning bucket and key
        """
        conn = boto.connect_s3(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY)
        try:
            bucket = conn.get_bucket(self.AWS_BUCKET_NAME)
        except boto.exception.S3ResponseError:
            bucket = conn.create_bucket(self.AWS_BUCKET_NAME)
        return bucket, boto.s3.key.Key(bucket)
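    # Note that a single Key object is reused for every upload: upload_s3()
    # rebinds key.name before each set_contents_from_string() call rather
    # than allocating a fresh Key per file.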
    def upload_s3(self, arg, dirname, names):
        """
        This is the callback to os.path.walk and where much of the work happens
        """
        bucket, key, bucket_name, root_dir = arg  # expand arg tuple

        # Skip directories we don't want to sync
        if os.path.basename(dirname) in self.FILTER_LIST:
            # prevent walk from processing subfiles/subdirs below the ignored one
            del names[:]
            return

        # Later we assume root_dir ends with a trailing slash
        if not root_dir.endswith(os.path.sep):
            root_dir = root_dir + os.path.sep

        for file in names:
            headers = {}
            if file in self.FILTER_LIST:
                continue  # Skip files we don't want to sync
            filename = os.path.join(dirname, file)
            file_key = filename[len(root_dir):]
            if os.path.isdir(filename):
                # Upload the directory's index.html under the directory's own
                # key, so a bare directory URL serves its index page.
                filename = os.path.join(filename, 'index.html')
                if not os.path.exists(filename):
                    continue
            if self.prefix:
                file_key = '%s/%s' % (self.prefix, file_key)

            # Check if the file on S3 is older than the local file; if so, upload
            if not self.do_force:
                s3_key = bucket.get_key(file_key)
                if s3_key:
                    # last_modified is an RFC 1123 timestamp in GMT,
                    # e.g. 'Tue, 26 Oct 2010 15:06:00 GMT'.
                    s3_datetime = datetime.datetime(*time.strptime(
                        s3_key.last_modified, '%a, %d %b %Y %H:%M:%S %Z')[0:6])
                    local_datetime = datetime.datetime.utcfromtimestamp(
                        os.stat(filename).st_mtime)
                    if local_datetime < s3_datetime:
                        self.skip_count += 1
                        if self.verbosity > 1:
                            print "File %s hasn't been modified since last " \
                                  "being uploaded" % (file_key)
                        continue

            # File is newer, let's process and upload
            if self.verbosity > 0:
                print "Uploading %s..." % (file_key)
            content_type = mimetypes.guess_type(filename)[0]
            if content_type:
                headers['Content-Type'] = content_type
            file_obj = open(filename, 'rb')
            filedata = file_obj.read()
            try:
                key.name = file_key
                key.set_contents_from_string(filedata, headers, replace=True)
                key.set_acl('public-read')
            except boto.exception.S3CreateError, e:
                print "Failed: %s" % e
            except Exception, e:
                print e
                raise
            else:
                self.upload_count += 1
            file_obj.close()
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Cache a python package from PyPI.')
    parser.add_argument('-j', '--project', action='store')
    parser.add_argument('-k', '--package', action='store')
    parser.add_argument('-s', '--sync', action='store_true')
    args = parser.parse_args()
    main(project=args.project, package=args.package, sync=args.sync)
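# To consume the synced cache (an assumption about how the bucket is served;
# the script itself only uploads), point pip or easy_install at a project's
# index page, e.g. with the bucket exposed over plain HTTP:
#
#   $ pip install --find-links http://pypi.yourdomain.org/myproject/index.html SomePackage
#
# This relies on the public-read ACL set in upload_s3() above.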