# Source: GitHub gist 01d67fe8b340c1ff7f78 by @ibebrett (created 2014-05-08).
#!/usr/bin/env python
import logging
import os
import random
import re
import simplejson as json
import string
import subprocess
import sys
from datetime import datetime, timedelta
import scrapepipeline.cache as cache
import scrapepipeline.cache.aggarchive as aggarchive
import scrapepipeline.cache.compete as compete
import scrapepipeline.cache.ghostery as ghostery
import scrapepipeline.cache.tagcache as tagcache
def rand(n):
    """Return a random string of *n* lowercase letters and digits.

    Uses the ``random`` module, which is NOT cryptographically secure --
    suitable for temp/scratch file names, not for security tokens.
    """
    # Portability fix: string.ascii_lowercase and range() exist on both
    # Python 2 and 3; the original's string.lowercase / xrange are Py2-only.
    alphabet = string.ascii_lowercase + string.digits
    return ''.join(random.choice(alphabet) for _ in range(n))
def run_cmd(cmd):
    """Execute *cmd* (an argv-style list) and wait for it to finish.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit status.
    """
    # check_call blocks until the command completes and raises on failure.
    subprocess.check_call(cmd)
def main(prefix, region, archive_date, email=False):
    """Build aggregate caches from one day's archive file.

    Parameters
    ----------
    prefix : str
        Cache namespace; also selects the /mnt/cache_files/<prefix> dir
        where the intermediate results JSON is written.
    region : str
        Region name; a region whose name contains 'mobile' enables
        mobile aggregation.
    archive_date : date/datetime
        Start of the archive's one-day window (end = start + 1 day).
    email : bool
        Forwarded to aggarchive.main as send_comparison_email.

    Returns
    -------
    dict
        ``{'caches': <result of aggarchive.main>}``; also dumped as JSON
        to <prefix_dir>/build_caches_intermediate.json.
    """
    prefix_dir = '/mnt/cache_files/%s' % prefix

    # The archive covers exactly one day starting at archive_date.
    start_date = archive_date
    end_date = start_date + timedelta(days=1)

    # Locate the archive file and sanity-check its header.
    # NOTE(review): the s3 download / gunzip steps were commented out in the
    # original -- presumably the file is fetched upstream; confirm.
    archive_filepath = cache.get_cache_filename(
        prefix, start_date, end_date, cache.ARCHIVE_TYPE, region=region)
    header = cache.read_cache_header(archive_filepath)
    cache.verify_cache(header, 'archive')

    # Trust the header's own date range over the caller-supplied one.
    start_date = header['cache']['start']
    end_date = header['cache']['end']

    # Build all caches; mobile regions additionally aggregate mobile data.
    mobile = 'mobile' in region
    caches = aggarchive.main(archive_filepath, prefix,
                             send_comparison_email=email,
                             aggregate_mobile=mobile)
    ret = {'caches': caches}

    # Persist the results for downstream steps.
    # FIX: the original wrapped this in "if not intermediate_outfile: dump to
    # stdout / else: dump to file" -- but intermediate_outfile was always a
    # non-empty path string, so the stdout branch was unreachable dead code.
    # The always-taken file branch is kept; the dead branch is removed.
    intermediate_outfile = os.path.join(prefix_dir,
                                        'build_caches_intermediate.json')
    logging.info("Writing results json to: %s", intermediate_outfile)
    with open(intermediate_outfile, 'w') as f:
        json.dump(ret, f, cls=SetEncoder)

    # Return the results to the caller as well.
    return ret
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``set`` objects as JSON arrays."""

    def default(self, obj):
        # Sets have no native JSON representation; emit them as lists.
        # Anything else falls through to the base implementation, which
        # raises TypeError for unserializable objects.
        if not isinstance(obj, set):
            return json.JSONEncoder.default(self, obj)
        return list(obj)
def go(region, prefix, date):
    """Entry-point wrapper: build caches for *date* in *region*.

    Note the argument order differs from main() -- (region, prefix) here
    vs (prefix, region) there; the forwarding below preserves that.

    FIX: the original bound main()'s result to an unused local and
    discarded it; propagate it to the caller instead (backward compatible
    -- callers that ignored the previous None return are unaffected).
    """
    return main(prefix, region, date, False)
# End of gist.