This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class SortedGcsCsvShardFileMergeReader(object): | |
"""Merges several sorted .csv files stored on GCS. | |
This class is both an iterator and a context manager. | |
Let's say there are 2 .csv files stored on GCS, with contents like: | |
/bucket/file_1.csv: | |
[0, "Matt"], | |
[0, "Sam"], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ContentRevisionsInputReader( | |
third_party.mapreduce.input_readers.InputReader): | |
"""Mapper that loads the latest commit's revisions from the datastore.""" | |
def __init__(self, kinds, start, step, current=0): | |
# List of content kinds to filter on | |
self._kinds = kinds | |
# Start index in snapshot array | |
self._start = start | |
# Stride through the snapshot array | |
self._step = step |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Runs two db_util setup hooks as soon as this module is imported.
# NOTE(review): judging by the names, these turn on protobuf projection for
# the db and ndb datastore APIs -- confirm against db_util.
import db_util | |
db_util.enable_db_protobuf_projection() | |
db_util.enable_ndb_protobuf_projection() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mr_pipeline = mapreduce_pipeline.MapperPipeline( | |
'bq_property_transform', | |
'extbackup.bq_property_transform.property_transform_mapper', | |
'third_party.mapreduce.input_readers.RecordsReader', | |
'third_party.mapreduce.output_writers.FileOutputWriter', | |
params={ | |
'input_reader': { | |
'files': list_backup_files(kind, backup_date), | |
}, | |
'output_writer': { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class MixPanel(object): | |
def get_page_view_funnel(self, content_urls): | |
# Build up the events array. Each "event" is a step in the funnel | |
events = [] | |
for cu in content_urls: | |
events.append({ | |
"event": "Page View", | |
"selector": 'properties["Page View Page"] == "%s"' % (cu,), | |
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class YouTubeQueryMasterPipeline(pipeline.Pipeline): | |
def run(self, user_ids): | |
"""Launches worker pipeline jobs to query the YouTube API. | |
Arguments: | |
user_ids: The user_ids of stored OAuth2 credentials. | |
""" | |
try: | |
for user_id in user_ids: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Here's how I got this to work: | |
# 1. Download this client library into your GAE project: | |
# https://developers.google.com/api-client-library/python/start/installation#appengine | |
# https://code.google.com/p/google-api-python-client/downloads/detail?name=google-api-python-client-gae-1.2.zip&can=2&q= | |
# 2. Copy this file from the GAE SDK installed on your development machine | |
# google/appengine/tools/appengine_rpc_httplib2.py | |
# 3. Modify the import statements as necessary | |
# 4. Create a secrets.py file that defines an appcfg_refresh_token property
# 5. Obtain the refresh token by calling
#    appcfg.py list_versions . --oauth2
#    This opens a browser so you can log in with your Google Account
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class UserData(db.Model): | |
birthdate = db.DateProperty(indexed=False) | |
def compute_birthdate_str(self):
    """Return the birthdate in ISO format, or the unset value unchanged.

    When ``self.birthdate`` is truthy its ``isoformat()`` result is
    returned; otherwise the falsy value itself (e.g. ``None``) is handed
    back as-is.
    """
    birthdate = self.birthdate
    return birthdate.isoformat() if birthdate else birthdate
# You will probably need to add this to index.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import logging | |
def map(data): | |
try: | |
# Generate a random key from 1..10 | |
key = random.randint(1, 10) | |
logging.info("%s %s", key, data) | |
yield (key, data) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class DataWriter(object): | |
"""Used by the QueryDrainerPipeline to coalesce intermediate results | |
into their final resting place. | |
""" | |
def write_metadata(self, metadata):
    """Write the metadata; the base class leaves this to subclasses.

    Arguments:
        metadata: The metadata to write. Its structure is not defined
            here -- see the concrete writer that overrides this method.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    # NotImplemented is the binary-operator sentinel, not an exception:
    # calling it fails with a confusing TypeError. NotImplementedError is
    # the exception meant for "subclass must override".
    raise NotImplementedError()
def write_result(self, data): | |
"""Writes all data.""" |
NewerOlder