# timhberry/beam-wordcount.py

## beam-wordcount.py
import re
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Word-count pipeline: reads a text file from Cloud Storage, counts how often
# each word occurs, and writes "word: count" lines back to Cloud Storage.

# Input and output files should be Cloud Storage locations
# beginning with gs://
input_file = 'gs://<your-bucket-name>/kinglear.txt'
output_path = 'gs://<your-bucket-name>/counts.txt'

# Replace <your-bucket-name>, <your-project-name> and <your-sa-email>
beam_options = PipelineOptions(
    runner='DataflowRunner',
    project='<your-project-name>',
    job_name='wordcount',
    temp_location='gs://<your-bucket-name>/temp',
    region='europe-west2',
    service_account_email='<your-sa-email>',
    use_public_ips=False,
)


def extract_words(line):
    """Return every run of ASCII letters and apostrophes found in *line*."""
    # Imported locally so the function does not depend on the launcher
    # script's module globals when it is shipped to remote workers
    # (module-level imports can be missing there unless the main session
    # is saved).
    import re
    return re.findall(r'[A-Za-z\']+', line)


def format_count(word, count):
    """Render one (word, count) pair as a 'word: count' output line."""
    return '%s: %s' % (word, count)


pipeline = beam.Pipeline(options=beam_options)

# Every step carries an explicit label so each stage has a stable,
# readable name instead of one derived from a lambda's source position.
(
    pipeline
    | 'ReadLines' >> beam.io.ReadFromText(input_file)
    | 'ExtractWords' >> beam.FlatMap(extract_words)
    | 'CountWords' >> beam.combiners.Count.PerElement()
    | 'FormatCounts' >> beam.MapTuple(format_count)
    | 'WriteCounts' >> beam.io.WriteToText(output_path)
)

# Run the pipeline once with the options configured above.
pipeline.run()