Created
July 2, 2019 06:48
-
-
Save jaketf/7b88578ee232316e38b5095957850795 to your computer and use it in GitHub Desktop.
Demonstrates that PCollections do maintain some order.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Example pipeline to show that PCollections are not written in order.
""" | |
import os | |
import numpy as np | |
import apache_beam as beam | |
from apache_beam.options.pipeline_options import PipelineOptions | |
def _split_data(examples, train_fraction=0.8, val_fraction=0.1):
    """Randomly split a PCollection into train/validation/test partitions.

    Every element is routed independently: a uniform random number in
    [0, 1) is drawn per element and compared against the cumulative
    fraction thresholds, so the resulting partition sizes only
    approximate the requested fractions.

    Args:
        examples: The PCollection to split.
        train_fraction: Probability that an element goes to TRAIN.
        val_fraction: Probability that an element goes to VAL. Whatever
            probability remains (1 - train_fraction - val_fraction) goes
            to TEST.

    Returns:
        A list of three ``(name, pcollection)`` pairs, in the order
        ``'TRAIN'``, ``'VAL'``, ``'TEST'``.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum
            to more than 1.
    """
    if train_fraction < 0 or val_fraction < 0:
        raise ValueError(
            'train_fraction and val_fraction must be non-negative, got '
            '{} and {}'.format(train_fraction, val_fraction))
    # Small tolerance so float sums such as 0.7 + 0.3 are never rejected.
    if train_fraction + val_fraction > 1.0 + 1e-9:
        raise ValueError(
            'train_fraction + val_fraction must not exceed 1, got '
            '{}'.format(train_fraction + val_fraction))

    # Upper bound of the VAL bucket; computed once, not once per element.
    val_threshold = train_fraction + val_fraction

    def partition_fn(*_):
        # The element and partition count are ignored: routing is random.
        random_value = np.random.random()
        if random_value < train_fraction:
            return 0
        if random_value < val_threshold:
            return 1
        return 2

    examples_split = examples | "SplitData" >> beam.Partition(partition_fn, 3)
    # Materialize the pairs: a bare zip() is a one-shot iterator on
    # Python 3 and would be empty if a caller iterated it a second time.
    return list(zip(['TRAIN', 'VAL', 'TEST'], examples_split))
# Build and run the same pipeline twice; the run index is part of each
# output file name so the results of the two runs can be compared.
for i in range(2):
    p = beam.Pipeline(options=PipelineOptions())
    data = p | beam.Create(range(1000))
    data = _split_data(data)
    for name, dataset in data:
        # One text sink per split, written under the current directory.
        # pylint: disable=expression-not-assigned
        dataset | "Write{}Output".format(name) >> beam.io.WriteToText(
            '{}/output-pipeline-run-{}-{}'.format(os.getcwd(), i, name),
            file_name_suffix='.txt')
    # Block until this run has finished so its output files are complete
    # before the next run starts (or the interpreter exits) on runners
    # whose run() returns without waiting.
    p.run().wait_until_finish()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment