Skip to content

Instantly share code, notes, and snippets.

@jaketf
Created July 2, 2019 06:48
Show Gist options
  • Save jaketf/7b88578ee232316e38b5095957850795 to your computer and use it in GitHub Desktop.
Save jaketf/7b88578ee232316e38b5095957850795 to your computer and use it in GitHub Desktop.
Demonstrates that PCollections do maintain some order.
"""
Example pipeline to show that PCollections are not written in order.
"""
import os
import numpy as np
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
def _split_data(examples, train_fraction=0.8, val_fraction=0.1):
    """Randomly partition `examples` into train, validation and test sets.

    Args:
        examples: The collection the "SplitData" Partition transform is
            applied to.
        train_fraction: Threshold below which a random draw sends an
            element to partition 0 (paired with 'TRAIN').
        val_fraction: Width of the band above `train_fraction` in which a
            draw sends an element to partition 1 (paired with 'VAL').
            Every other draw goes to partition 2 (paired with 'TEST').

    Returns:
        The `zip` of the names 'TRAIN', 'VAL', 'TEST' with the three
        partitions produced by `beam.Partition`.
    """
    # Cumulative upper bound of the validation band.
    val_upper_bound = train_fraction + val_fraction

    def partition_fn(*_):
        # The element itself is ignored; a single random draw per call
        # picks the partition index.
        draw = np.random.random()
        if draw < train_fraction:
            index = 0
        elif draw < val_upper_bound:
            index = 1
        else:
            index = 2
        return index

    partitions = examples | "SplitData" >> beam.Partition(partition_fn, 3)
    split_names = ['TRAIN', 'VAL', 'TEST']
    return zip(split_names, partitions)
# Build and run the same pipeline twice so the outputs of the two runs can be
# compared against each other.
for i in range(2):
    p = beam.Pipeline(options=PipelineOptions())
    examples = p | beam.Create(range(1000))
    splits = _split_data(examples)
    for name, dataset in splits:
        # Each split gets its own uniquely labelled write step and its own
        # output path prefix, tagged with the run index and the split name.
        #pylint: disable=expression-not-assigned
        dataset | "Write{}Output".format(name) >> beam.io.WriteToText(
            '{}/output-pipeline-run-{}-{}'.format(os.getcwd(), i, name),
            file_name_suffix='.txt')
    # Block until this run has finished: run() only returns a result handle,
    # and on runners that execute asynchronously the next iteration (or
    # interpreter exit) could otherwise begin before the outputs are written.
    p.run().wait_until_finish()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment