# Source: GitHub gist 01d67fe8b340c1ff7f78 by @ibebrett (created 2014-05-08).
#!/usr/bin/env python
import logging
import os
import random
import re
import simplejson as json
import string
import subprocess
import sys
from datetime import datetime, timedelta
import scrapepipeline.cache as cache
import scrapepipeline.cache.aggarchive as aggarchive
import scrapepipeline.cache.compete as compete
import scrapepipeline.cache.ghostery as ghostery
import scrapepipeline.cache.tagcache as tagcache
def rand(n):
    """Return a random string of *n* lowercase letters and digits.

    Uses the ``random`` module, which is NOT cryptographically secure --
    suitable for temp/scratch file names, not for security tokens.
    """
    # Portability fix: string.ascii_lowercase and range() exist on both
    # Python 2 and 3; the original's string.lowercase / xrange are Py2-only.
    alphabet = string.ascii_lowercase + string.digits
    return ''.join(random.choice(alphabet) for _ in range(n))
def run_cmd(cmd):
    """Execute *cmd* (an argv-style list) and wait for it to finish.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit status.
    """
    # check_call blocks until the command completes and raises on failure.
    subprocess.check_call(cmd)
def main(prefix, region, archive_date, email=False):
    """Build aggregate caches from one day's archive file.

    Parameters
    ----------
    prefix : str
        Cache namespace; also selects the /mnt/cache_files/<prefix> dir
        where the intermediate results JSON is written.
    region : str
        Region name; a region whose name contains 'mobile' enables
        mobile aggregation.
    archive_date : date/datetime
        Start of the archive's one-day window (end = start + 1 day).
    email : bool
        Forwarded to aggarchive.main as send_comparison_email.

    Returns
    -------
    dict
        ``{'caches': <result of aggarchive.main>}``; also dumped as JSON
        to <prefix_dir>/build_caches_intermediate.json.
    """
    prefix_dir = '/mnt/cache_files/%s' % prefix

    # The archive covers exactly one day starting at archive_date.
    start_date = archive_date
    end_date = start_date + timedelta(days=1)

    # Locate the archive file and sanity-check its header.
    # NOTE(review): the s3 download / gunzip steps were commented out in the
    # original -- presumably the file is fetched upstream; confirm.
    archive_filepath = cache.get_cache_filename(
        prefix, start_date, end_date, cache.ARCHIVE_TYPE, region=region)
    header = cache.read_cache_header(archive_filepath)
    cache.verify_cache(header, 'archive')

    # Trust the header's own date range over the caller-supplied one.
    start_date = header['cache']['start']
    end_date = header['cache']['end']

    # Build all caches; mobile regions additionally aggregate mobile data.
    mobile = 'mobile' in region
    caches = aggarchive.main(archive_filepath, prefix,
                             send_comparison_email=email,
                             aggregate_mobile=mobile)
    ret = {'caches': caches}

    # Persist the results for downstream steps.
    # FIX: the original wrapped this in "if not intermediate_outfile: dump to
    # stdout / else: dump to file" -- but intermediate_outfile was always a
    # non-empty path string, so the stdout branch was unreachable dead code.
    # The always-taken file branch is kept; the dead branch is removed.
    intermediate_outfile = os.path.join(prefix_dir,
                                        'build_caches_intermediate.json')
    logging.info("Writing results json to: %s", intermediate_outfile)
    with open(intermediate_outfile, 'w') as f:
        json.dump(ret, f, cls=SetEncoder)

    # Return the results to the caller as well.
    return ret
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``set`` objects as JSON arrays."""

    def default(self, obj):
        # Sets have no native JSON representation; emit them as lists.
        # Anything else falls through to the base implementation, which
        # raises TypeError for unserializable objects.
        if not isinstance(obj, set):
            return json.JSONEncoder.default(self, obj)
        return list(obj)
def go(region, prefix, date):
    """Entry-point wrapper: build caches for *date* in *region*.

    Note the argument order differs from main() -- (region, prefix) here
    vs (prefix, region) there; the forwarding below preserves that.

    FIX: the original bound main()'s result to an unused local and
    discarded it; propagate it to the caller instead (backward compatible
    -- callers that ignored the previous None return are unaffected).
    """
    return main(prefix, region, date, False)
# End of gist.