# fozziethebeat/minimal_can_beam_pipeline.py

## minimal_can_beam_pipeline.py
# Requires
#  pip install apache-beam
#  pip install apache-beam[dataframe]
#
# Associated documentation
#   Beam Dataframe API: https://beam.apache.org/releases/pydoc/2.34.0/apache_beam.dataframe.html
#   Beam Dataframe Overview: https://beam.apache.org/documentation/dsls/dataframes/overview/
#   Beam Dataframe Differences: https://beam.apache.org/documentation/dsls/dataframes/differences-from-pandas/


import apache_beam as beam
from apache_beam.dataframe.io import read_parquet
from apache_beam.options.pipeline_options import PipelineOptions

def make_bucketed_demographics(row):
    """Join the 'age' and 'race' entries of *row* into one ';'-separated value.

    Both entries are looked up by key and combined with ``+``, so they must
    support concatenation with the ``';'`` string literal.

    NOTE(review): when this function is handed to ``DataFrame.assign`` as a
    callable, the argument is a frame rather than a single row, and the ``+``
    is then applied column-wise -- presumably both columns hold strings;
    confirm against the input schema.
    """
    # Build "<age>;" first so the evaluation order matches a left-to-right
    # chain of additions.
    age_prefix = row['age'] + ';'
    return age_prefix + row['race']

# Build the pipeline inside a context manager; per the Beam documentation the
# pipeline is run when the `with` block exits.
# NOTE(review): `data_file` is not defined in the visible source -- it must be
# bound to the input Parquet path/pattern before this statement executes.
with beam.Pipeline(options=PipelineOptions()) as pipeline:
    # Read the Parquet input as a deferred Beam dataframe.
    timeseries = pipeline | read_parquet(data_file)
    # `assign` returns a new frame instead of modifying the receiver, so the
    # result has to be kept; otherwise the `bucketed` column is silently
    # dropped from the output.
    timeseries = timeseries.assign(bucketed=make_bucketed_demographics)
    # Write the frame, including the derived column, back out as Parquet.
    timeseries.to_parquet('out.parquet')
	# Requires
	# pip install apache-beam
	# pip install apache-beam[dataframe]
	#
	# Associated documentation
	# Beam Dataframe API: https://beam.apache.org/releases/pydoc/2.34.0/apache_beam.dataframe.html
	# Beam Dataframe Overview: https://beam.apache.org/documentation/dsls/dataframes/overview/
	# Beam Dataframe Differences: https://beam.apache.org/documentation/dsls/dataframes/differences-from-pandas/


	import apache_beam as beam
	from apache_beam.dataframe.io import read_parquet
	from apache_beam.options.pipeline_options import PipelineOptions

	def make_bucketed_demographics(row):
		"""Join the 'age' and 'race' entries of *row* into one ';'-separated value.

		Both entries are looked up by key and combined with ``+``, so they must
		support concatenation with the ``';'`` string literal.
		"""
		# The body must be indented one level below `def`; at the same level
		# as `def` the function has no body and the file does not parse.
		return row['age'] + ';' + row['race']

	# Build the pipeline inside a context manager; per the Beam documentation
	# the pipeline is run when the `with` block exits.
	# NOTE(review): `data_file` is not defined in the visible source -- it must
	# be bound to the input Parquet path/pattern before this statement executes.
	with beam.Pipeline(options=PipelineOptions()) as pipeline:
		# Apply the read transform with a plain `|`; the escaped form `\|` is
		# not valid Python.
		timeseries = pipeline | read_parquet(data_file)
		# `assign` returns a new frame instead of modifying the receiver, so
		# the result has to be kept for the `bucketed` column to be written.
		timeseries = timeseries.assign(bucketed=make_bucketed_demographics)
		# Write the frame, including the derived column, back out as Parquet.
		timeseries.to_parquet('out.parquet')