The MapReduce job we use to transform datastore backups into JSON files that we then load into BigQuery:
from third_party.mapreduce import mapreduce_pipeline

mr_pipeline = mapreduce_pipeline.MapperPipeline(
    'bq_property_transform',
    # Mapper that turns each backup record into a JSON line.
    'extbackup.bq_property_transform.property_transform_mapper',
    # Reads raw records out of the datastore backup files.
    'third_party.mapreduce.input_readers.RecordsReader',
    # Writes the mapper output to Google Cloud Storage.
    'third_party.mapreduce.output_writers.FileOutputWriter',
    params={
        'input_reader': {
            'files': list_backup_files(kind, backup_date),
        },
        'output_writer': {
            'filesystem': 'gs',
            'mime_type': 'text/plain',
            # One output file per input file.
            'output_sharding': 'input',
            'gs_bucket_name': get_output_path(kind, backup_date),
        },
        'backup_date': backup_date.encode('UTF-8'),
        'kind': kind.encode('UTF-8'),
    },
    shards=N_SHARDS,
)
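The mapper itself isn't shown here. As a rough sketch only: assuming the usual App Engine pattern, where RecordsReader hands the mapper one serialized entity proto per backup record, property_transform_mapper might look something like the following (the body is hypothetical; the real implementation lives in extbackup.bq_property_transform and presumably does proper per-property type conversion for BigQuery):

import json

from google.appengine.api import datastore
from google.appengine.datastore import entity_pb


def property_transform_mapper(record):
    """Decode one backup record and emit it as a JSON line (sketch)."""
    entity_proto = entity_pb.EntityProto(contents=record)
    entity = datastore.Entity.FromPb(entity_proto)
    # datastore.Entity is a dict subclass, so dict(entity) gives the
    # property map. default=str is a blunt fallback for datetimes,
    # Keys, users, etc.; real code would convert each type explicitly.
    yield json.dumps(dict(entity), default=str) + '\n'

The pipeline is kicked off with mr_pipeline.start(); once it finishes, the JSON files sit under the GCS path returned by get_output_path, ready to be loaded into BigQuery.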