@MattFaus
MattFaus / SortedGcsCsvShardFileMergeReader.py
Last active February 23, 2022 11:58
Merge-reads several sorted .csv files stored on Google Cloud Storage.
class SortedGcsCsvShardFileMergeReader(object):
    """Merges several sorted .csv files stored on GCS.

    This class is both an iterator and a context manager.

    Let's say there are 2 .csv files stored on GCS, with contents like:

    /bucket/file_1.csv:
        [0, "Matt"],
        [0, "Sam"],
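The full reader is not shown in this preview. As a rough sketch of the same k-way merge idea, assuming plain file-like streams in place of GCS blobs (the helper name below is not from the gist), Python's `heapq.merge` does the heavy lifting:

```python
import csv
import heapq
import io

def merge_sorted_csv_streams(streams, key_column=0):
    """Merge several individually sorted CSV streams into one sorted iterator.

    Each stream must already be sorted by the key column. heapq.merge
    performs the k-way merge lazily, holding only one row per stream
    in memory at a time.
    """
    readers = [csv.reader(s) for s in streams]
    return heapq.merge(*readers, key=lambda row: row[key_column])

# Example with two in-memory "shard files", each sorted by column 0:
shard_1 = io.StringIO("0,Matt\n0,Sam\n2,Ann\n")
shard_2 = io.StringIO("1,Bob\n3,Zoe\n")
merged = list(merge_sorted_csv_streams([shard_1, shard_2]))
# merged rows are ordered by the first column: 0, 0, 1, 2, 3
```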
@MattFaus
MattFaus / ContentRevisionsInputReader.py
Last active August 21, 2021 08:11
A custom appengine-mapreduce input_reader and output_writer. The ContentRevisionsInputReader iterates over a complex data structure we use to keep track of modifications to each of the content nodes within the Khan Academy content management system. This is a utility class that is used to intelligently rollout schema changes to these nodes. The …
class ContentRevisionsInputReader(
        third_party.mapreduce.input_readers.InputReader):
    """Mapper that loads the latest commit's revisions from the datastore."""

    def __init__(self, kinds, start, step, current=0):
        # List of content kinds to filter on
        self._kinds = kinds
        # Start index in snapshot array
        self._start = start
        # Stride through the snapshot array
        self._step = step
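The start/step pair implies each mapper shard strides through the snapshot array. A minimal sketch of that striding, using a hypothetical `shard_indices` helper that is not part of the gist:

```python
def shard_indices(total, start, step, current=0):
    """Yield the snapshot-array indices assigned to one shard.

    Shard k of n covers indices k, k+n, k+2n, ..., so several mapper
    shards can walk the same array without overlapping. `current`
    tracks the resume position, letting a reader be serialized and
    restarted mid-way through its slice.
    """
    index = start + current * step
    while index < total:
        yield index
        index += step

# Two shards splitting a 7-element snapshot array:
list(shard_indices(7, start=0, step=2))  # [0, 2, 4, 6]
list(shard_indices(7, start=1, step=2))  # [1, 3, 5]
```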
@MattFaus
MattFaus / appengine_config.py
Last active August 3, 2018 12:28
All of the code necessary to implement and test protobuf projection in a Google Appengine web application.
import db_util
db_util.enable_db_protobuf_projection()
db_util.enable_ndb_protobuf_projection()
@MattFaus
MattFaus / bq_property_transform.py
Created March 22, 2014 00:28
The mapreduce job we use to transform datastore backups into JSON files that we then load into BigQuery.
mr_pipeline = mapreduce_pipeline.MapperPipeline(
    'bq_property_transform',
    'extbackup.bq_property_transform.property_transform_mapper',
    'third_party.mapreduce.input_readers.RecordsReader',
    'third_party.mapreduce.output_writers.FileOutputWriter',
    params={
        'input_reader': {
            'files': list_backup_files(kind, backup_date),
        },
        'output_writer': {
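The `property_transform_mapper` itself is not shown in this preview. As a hypothetical sketch of the kind of transform such a mapper performs (the function name and type handling below are illustrative, not from the gist), each entity is serialized to one newline-delimited JSON record that BigQuery can load:

```python
import datetime
import json

def entity_to_bq_json(entity):
    """Serialize a datastore-like entity dict to one newline-delimited
    JSON record, coercing types BigQuery cannot ingest directly."""
    row = {}
    for name, value in entity.items():
        if isinstance(value, (datetime.datetime, datetime.date)):
            # Dates become ISO-8601 strings.
            row[name] = value.isoformat()
        elif isinstance(value, bytes):
            row[name] = value.decode("utf-8", errors="replace")
        else:
            row[name] = value
    return json.dumps(row, sort_keys=True)

record = entity_to_bq_json(
    {"name": "Matt", "joined": datetime.date(2013, 7, 24)})
# '{"joined": "2013-07-24", "name": "Matt"}'
```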
@MattFaus
MattFaus / mixpanel_funnels.py
Created January 2, 2014 23:03
A function to programmatically build and query funnel data via the MixPanel API.
class MixPanel(object):

    def get_page_view_funnel(self, content_urls):
        # Build up the events array. Each "event" is a step in the funnel
        events = []
        for cu in content_urls:
            events.append({
                "event": "Page View",
                "selector": 'properties["Page View Page"] == "%s"' % (cu,),
            })
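The event-building step above can be exercised on its own. A self-contained version (the MixPanel HTTP query that follows it in the gist is omitted, and the standalone function name is not from the gist):

```python
def build_page_view_funnel_events(content_urls):
    """Build the MixPanel funnel 'events' array, one step per URL.

    Mirrors the loop in get_page_view_funnel: each step matches
    Page View events whose "Page View Page" property equals the URL.
    """
    return [
        {
            "event": "Page View",
            "selector": 'properties["Page View Page"] == "%s"' % (url,),
        }
        for url in content_urls
    ]

events = build_page_view_funnel_events(["/math/algebra", "/math/calculus"])
# events[0]["selector"] == 'properties["Page View Page"] == "/math/algebra"'
```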
@MattFaus
MattFaus / youtube_query_pipeline.py
Created December 17, 2013 22:36
A high level overview of the pipeline job Khan Academy uses to download analytics data about each of its videos to perform more complex cross-video analysis.
class YouTubeQueryMasterPipeline(pipeline.Pipeline):

    def run(self, user_ids):
        """Launches worker pipeline jobs to query the YouTube API.

        Arguments:
            user_ids: The user_ids of stored OAuth2 credentials.
        """
        try:
            for user_id in user_ids:
@MattFaus
MattFaus / serverside_appcfg.py
Created November 2, 2013 04:22
A way to programmatically list the versions deployed to a Google AppEngine application by impersonating an RPC from appcfg.py to Google's management servers, using an oauth2 token from an administrator of the application.
# Here's how I got this to work:
# 1. Download this client library into your GAE project:
#    https://developers.google.com/api-client-library/python/start/installation#appengine
#    https://code.google.com/p/google-api-python-client/downloads/detail?name=google-api-python-client-gae-1.2.zip&can=2&q=
# 2. Copy this file from the GAE SDK installed on your development machine:
#    google/appengine/tools/appengine_rpc_httplib2.py
# 3. Modify the import statements as necessary
# 4. Create a secrets.py file that defines an appcfg_refresh_token property
# 5. Obtain the refresh token by calling `appcfg.py list_versions . --oauth2`,
#    which opens a browser so you can log in with your Google Account
@MattFaus
MattFaus / datetime_mapreduce.py
Created October 7, 2013 18:22
A technique for storing a JSON-serializable datetime as a computed property so that mapreduce can filter against it.
class UserData(db.Model):
    birthdate = db.DateProperty(indexed=False)

    def compute_birthdate_str(self):
        if self.birthdate:
            return self.birthdate.isoformat()
        return self.birthdate

# You will probably need to add this to index.yaml
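A plain-Python stand-in (no `db.Model`, so it differs from the gist's real class, which would pair the method with a `db.ComputedProperty`) shows why the computed string works for filtering: ISO-8601 strings sort identically to the dates they encode, so mapreduce can do lexicographic range comparisons against them.

```python
import datetime

class UserData(object):
    """Illustrative stand-in for the db.Model above."""

    def __init__(self, birthdate=None):
        self.birthdate = birthdate

    def compute_birthdate_str(self):
        # ISO-8601 strings ("1990-05-17") sort the same way the
        # underlying dates do, which is what makes string-based
        # range filters in mapreduce behave correctly.
        if self.birthdate:
            return self.birthdate.isoformat()
        return self.birthdate

UserData(datetime.date(1990, 5, 17)).compute_birthdate_str()  # "1990-05-17"
UserData().compute_birthdate_str()  # None
```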
@MattFaus
MattFaus / combiner_spec_test.py
Created October 1, 2013 00:41
A quick experiment with the combiner_spec parameter in the appengine MapreducePipeline()
import random
import logging

def map(data):
    try:
        # Generate a random key from 1..10
        key = random.randint(1, 10)
        logging.info("%s %s", key, data)
        yield (key, data)
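The combiner itself is truncated in this preview. As a plain-Python sketch of what a combiner buys you, assuming a simple count-per-key aggregation (not necessarily what the gist's `combiner_spec` does), the combiner pre-aggregates each shard's output so far less data is shuffled to the reducers:

```python
import collections
import random

def map_phase(records):
    """Mirror of map() above: assign each record a random key 1..10."""
    for data in records:
        yield random.randint(1, 10), data

def combine_phase(pairs):
    """Pre-aggregate per key, as a combiner would do within one shard.

    Here we collapse each key's values down to a count, shrinking the
    (key, value) stream handed to the shuffle/reduce stage.
    """
    counts = collections.Counter()
    for key, _ in pairs:
        counts[key] += 1
    for key, count in sorted(counts.items()):
        yield key, count

combined = dict(combine_phase(map_phase(range(100))))
# All 100 records are accounted for, spread over at most 10 keys.
```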
@MattFaus
MattFaus / SimpleGaeMapReduce.py
Created July 24, 2013 02:14
An overview of the generic helper classes that could make the pipeline library much easier and more powerful to use while running on pristine production data.
class DataWriter(object):
    """Used by the QueryDrainerPipeline to coalesce intermediate results
    into their final resting place.
    """

    def write_metadata(self, metadata):
        raise NotImplementedError()

    def write_result(self, data):
        """Writes all data."""