Matt Faus MattFaus

MattFaus / appengine_config.py
Last active Aug 3, 2018
All of the code necessary to implement and test protobuf projection in a Google Appengine web application.
import db_util
db_util.enable_db_protobuf_projection()
db_util.enable_ndb_protobuf_projection()
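The `appengine_config.py` module is a natural home for these calls because App Engine imports it at startup, before application code runs. As a rough illustration of the monkey-patch-at-startup idea (using an entirely hypothetical toy model class and helper, not the gist's `db_util` internals), a projection hook can wrap a deserializer so only the requested fields survive decoding:

```python
class FakeModel(object):
    """Stand-in for a datastore model class (hypothetical)."""

    @classmethod
    def _deserialize(cls, data):
        # Pretend this decodes an entity protobuf into property values.
        return dict(data)


def enable_projection(model_class, fields):
    """Wrap the class's deserializer to keep only the projected fields.

    This mirrors the general shape of installing a projection hook at
    startup; the real gist patches db/ndb protobuf decoding instead.
    """
    original = model_class._deserialize

    def projected(data):
        return {k: v for k, v in original(data).items() if k in fields}

    model_class._deserialize = staticmethod(projected)
```

Installing the hook once at startup means every later deserialization is projected, with no changes at call sites.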
MattFaus / keybase.md
Created Nov 17, 2014
Verification of my keybase public key

Keybase proof

I hereby claim:

  • I am mattfaus on github.
  • I am mattfaus (https://keybase.io/mattfaus) on keybase.
  • I have a public key whose fingerprint is 1CF5 6643 9369 2689 9402 2358 69E8 0354 58E5 E154

To claim this, I am signing this object:

MattFaus / BatchedGcsCsvShardFileWriter.py
Created Oct 29, 2014
Writes CSV data into multiple output shards, grouping rows by keys. Output shards are written to Google Cloud Storage.
class BatchedGcsCsvShardFileWriter(object):
    """Writes CSV data into multiple output shards, grouping rows by keys.

    This class is a context manager, which closes all shards upon exit.

    Say you are writing a lot of CSV data, like:

        [0, "Bakery"],
        [2, "Francisco"],
        [3, "Matt"],
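A minimal in-memory sketch of the key-grouped sharding idea, with local `StringIO` buffers standing in for GCS shard files (class and method names here are hypothetical, not from the gist):

```python
import csv
import io


class InMemoryCsvShardWriter(object):
    """Routes CSV rows to shards so rows sharing a key stay together.

    The shard is chosen by hashing the key modulo the shard count, which
    gives a stable key -> shard assignment within a process.
    """

    def __init__(self, num_shards):
        self.buffers = [io.StringIO() for _ in range(num_shards)]
        self.writers = [
            csv.writer(b, lineterminator="\n") for b in self.buffers
        ]

    def write_row(self, key, row):
        """Write the row to its key's shard and return the shard index."""
        shard = hash(key) % len(self.buffers)
        self.writers[shard].writerow(row)
        return shard
```

Because the assignment is a pure function of the key, a later merge step can rely on all rows for a key living in exactly one shard.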
MattFaus / SortedGcsCsvShardFileMergeReader.py
Last active Aug 29, 2015
Merge-reads several sorted .csv files stored on Google Cloud Storage.
class SortedGcsCsvShardFileMergeReader(object):
    """Merges several sorted .csv files stored on GCS.

    This class is both an iterator and a context manager.

    Let's say there are 2 .csv files stored on GCS, with contents like:

        /bucket/file_1.csv:
            [0, "Matt"],
            [0, "Sam"],
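The core merge-reading idea can be sketched with the standard library's `heapq.merge`, which lazily interleaves already-sorted streams without loading them all into memory (local row lists stand in for the GCS shard files; the function name is hypothetical):

```python
import heapq


def merge_sorted_shards(shards, key_column=0):
    """Yield rows from several sorted row sequences in global sorted order.

    Each shard must already be sorted by the key column -- the same
    precondition the gist's merge reader relies on.
    """
    return heapq.merge(*shards, key=lambda row: row[key_column])
```

Iterating the result visits every row exactly once, in key order, pulling one row at a time from each shard, which is what makes merge-reading memory-cheap.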
MattFaus / ParallelInMemorySortGcsCsvShardFiles.py
Created Oct 29, 2014
A Pipeline job which launches a map-only job to sort .csv files in memory. Each .csv file is read from Google Cloud Storage into memory, sorted by the specified key, and then written back out to Google Cloud Storage. The machine running the sorting process must have roughly 10x as much memory as the size of each .csv file.
class ParallelInMemorySortGcsCsvShardFiles(pipeline.Pipeline):

    def run(self, input_bucket, input_pattern, sort_columns,
            model_type, output_bucket, output_pattern):
        """Sorts each input file in-memory, then writes it to an output file.

        Arguments:
            input_bucket - The GCS bucket which contains the unsorted .csv
                files.
            input_pattern - A regular expression used to find files in the
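The per-file sort step reduces to "read everything, sort by the key columns, write it back". A self-contained sketch with CSV text standing in for the GCS objects (the helper name is hypothetical):

```python
import csv
import io


def sort_csv_in_memory(csv_text, sort_columns):
    """Load a whole CSV into memory, sort by the given column indexes,
    and return the sorted CSV text.

    A local stand-in for one mapper's read/sort/write round trip; the
    full-materialization is why the pipeline needs memory headroom well
    beyond each file's size.
    """
    rows = list(csv.reader(io.StringIO(csv_text)))
    rows.sort(key=lambda row: tuple(row[i] for i in sort_columns))
    out = io.StringIO()
    csv.writer(out, lineterminator="\n").writerows(rows)
    return out.getvalue()
```

Sorting each shard independently like this is what makes the later merge-read step possible.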
MattFaus / DeterministicCompressedFeatures.py
Created Oct 8, 2014
An improvement over the CompressedFeatures class introduced at http://derandomized.com/post/51709771229/compressed-features-for-machine-learning#disqus_thread: it removes the need to store the key->component mapping.
class DeterministicCompressedFeatures(CompressedFeatures):
    """Generates random components after seeding with the component_key.

    By using a known seed to generate the random components, we do not need to
    store or manage them. We can just recompute them whenever we need.
    """

    def __init__(self, num_features=RANDOM_FEATURE_LENGTH):
        # The original snippet passed DeterministicallyRandomFeatures to
        # super(); the class is named DeterministicCompressedFeatures.
        super(DeterministicCompressedFeatures, self).__init__(num_features)
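The key insight is that a pseudo-random component can be recomputed from its key alone by seeding the generator deterministically. A small sketch of that idea (the hashing and distribution choices here are assumptions, not necessarily the gist's):

```python
import hashlib
import random


def deterministic_component(component_key, num_features=4):
    """Recompute a pseudo-random feature vector from the key alone.

    Seeding a private Random instance with a digest of the key means the
    same key always yields the same components, so nothing needs to be
    stored or managed.
    """
    seed = int(hashlib.md5(component_key.encode("utf-8")).hexdigest(), 16)
    rng = random.Random(seed)
    return [rng.gauss(0.0, 1.0) for _ in range(num_features)]
```

Using a cryptographic digest rather than Python's built-in `hash()` keeps the seed stable across processes and runs, which is what makes the recomputation safe.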
MattFaus / 2014_05_31_transformed.Video.json
Created Jun 4, 2014
BigQuery's JSON representation of the schema of 2014_05_31_transformed.Video.
{
    u'fields': [{
        u'type': u'STRING',
        u'name': u'playlists',
        u'mode': u'REPEATED'
    }, {
        u'type': u'STRING',
        u'name': u'source_table',
        u'mode': u'NULLABLE'
    }, {
MattFaus / bq_connection.py
Last active Aug 29, 2015
Some helper functions to build a SELECT statement for defining a view.
def get_table_schema(dataset, table):
    """If the table exists, returns its schema. Otherwise, returns None."""
    table_service = BigQueryService.get_service().tables()
    try:
        get_result = table_service.get(
            projectId=BQ_PROJECT_ID,
            datasetId=dataset,
            tableId=table,
        ).execute()
        return get_result['schema']
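Given a schema dict shaped like the `{'fields': [{'name': ...}, ...]}` structure that `tables().get()` returns, building the view's SELECT statement is mostly string assembly. A hypothetical helper sketching that step (the bracketed table syntax matches BigQuery's legacy SQL of that era):

```python
def build_view_select(schema, dataset, table):
    """Build a flat SELECT over every field in a BigQuery schema dict.

    `schema` is assumed to look like {'fields': [{'name': ...}, ...]},
    the shape returned by the tables().get() call above.
    """
    columns = ", ".join(field["name"] for field in schema["fields"])
    return "SELECT %s FROM [%s.%s]" % (columns, dataset, table)
```

Deriving the column list from the live table schema keeps the view definition from drifting when the underlying table gains fields.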
MattFaus / advanced_mapreduce.py
Created May 1, 2014
Experimental code demonstrating arbitrary mappers and reducers in the mapreduce library
import collections
import jinja2
import logging
import os
import request_handler
import third_party.mapreduce
import third_party.mapreduce.input_readers
import third_party.mapreduce.output_writers
import third_party.mapreduce.lib.files
import third_party.mapreduce.operation
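The "arbitrary mappers and reducers" the gist demonstrates follow the standard map -> shuffle -> reduce flow. A tiny single-process model of that flow, for orientation only (the real library distributes each phase across task-queue workers):

```python
import collections


def simple_mapreduce(records, mapper, reducer):
    """Run a map -> shuffle -> reduce pass over an in-memory iterable.

    mapper(record) yields (key, value) pairs; the shuffle groups values
    by key; reducer(key, values) folds each group to a final value.
    """
    shuffled = collections.defaultdict(list)
    for record in records:
        for key, value in mapper(record):
            shuffled[key].append(value)
    return {key: reducer(key, values) for key, values in shuffled.items()}
```

The classic word-count example is one mapper emitting `(word, 1)` pairs and one reducer summing them.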
MattFaus / custom_bq_transformers.py
Created Mar 22, 2014
A custom property transformer to translate an ndb.JsonProperty into a repeated record with fields for each of the keys in the original JSON.
class TransformedVideoTranslationInfo(bq_property_transform.TransformedEntity):

    CUSTOM_SCHEMAS = {
        'translated_youtube_ids': {
            'name': 'translated_youtube_ids',
            'type': 'record',
            'mode': 'repeated',
            'fields': [
                {'name': 'language',
                 'type': 'string'},
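The transformation itself amounts to flattening a JSON dict into one record per key. A sketch of that step, assuming the repeated record carries `language` and `youtube_id` fields (the second field name is a guess; the schema fragment above is truncated after `language`):

```python
def json_property_to_repeated_record(translations):
    """Flatten a JSON dict like {"es": "yt_es_id"} into repeated-record
    rows: one {language, youtube_id} record per key.

    Sorting by language keeps the output deterministic across runs.
    """
    return [
        {"language": language, "youtube_id": youtube_id}
        for language, youtube_id in sorted(translations.items())
    ]
```

Each emitted dict matches one repetition of the record, which is the row shape BigQuery expects for a repeated record field.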