andy51002000/dataflow example.py

## dataflow example.py
class WordcountOptions(PipelineOptions):
    """Command-line flags used by the word-count pipeline.

    Declares ``--input`` (optional, with a sample-text default) and
    ``--output`` (required) on the parser given to ``_add_argparse_args``.
    """

    @classmethod
    def _add_argparse_args(cls, parser):
        """Register the word-count flags on ``parser``.

        Args:
            parser: object exposing an argparse-style ``add_argument``.
        """
        # Source text; optional because a sample path is the default.
        parser.add_argument(
            '--input',
            help='Path of the file to read from',
            default='gs://dataflow-samples/shakespeare/kinglear.txt',
        )
        # Destination for the results; marked required.
        parser.add_argument(
            '--output',
            help='Output file to write results to.',
            required=True,
        )

# Build the options from a fixed argument list: only --output is supplied,
# so --input is left at whatever default WordcountOptions declares.
pipeline_options = PipelineOptions(['--output', './result.txt'])

# Pipeline bound to those options and to an InteractiveRunner instance.
p = beam.Pipeline(runner=InteractiveRunner(), options=pipeline_options)

# View of the same options through which the word-count flags are read.
wordcount_options = pipeline_options.view_as(WordcountOptions)

# Read lines -> split into word tokens -> lowercase -> count per element.
count = (
    p
    | 'ReadCollection' >> beam.io.ReadFromText(wordcount_options.input)
    | 'findWord' >> beam.FlatMap(
        # A token is a run of word characters and/or apostrophes.
        lambda text: re.findall(r'[\w\']+', text.strip(), flags=re.UNICODE))
    | "lower" >> beam.Map(lambda token: token.lower())
    | "lower_count" >> beam.combiners.Count.PerElement()
)
	class WordcountOptions(PipelineOptions):
	    """Command-line flags used by the word-count pipeline.

	    Declares ``--input`` (optional, with a sample-text default) and
	    ``--output`` (required) on the parser given to ``_add_argparse_args``.
	    """

	    @classmethod
	    def _add_argparse_args(cls, parser):
	        """Register the word-count flags on ``parser``.

	        Args:
	            parser: object exposing an argparse-style ``add_argument``.
	        """
	        parser.add_argument(
	            '--input',
	            default='gs://dataflow-samples/shakespeare/kinglear.txt',
	            help='Path of the file to read from')
	        parser.add_argument(
	            '--output',
	            required=True,
	            help='Output file to write results to.')

	# Build the options from a fixed argument list; only --output is supplied.
	pipeline_options = PipelineOptions(['--output', './result.txt'])
	# Pipeline bound to those options and to an InteractiveRunner instance.
	p = beam.Pipeline(options=pipeline_options, runner=InteractiveRunner())

	# View of the same options through which the word-count flags are read.
	wordcount_options = pipeline_options.view_as(WordcountOptions)

	# Read lines -> split into word tokens -> lowercase -> count per element.
	# The transforms are chained with the plain `|` operator.
	count = (p
	         | 'ReadCollection' >> beam.io.ReadFromText(wordcount_options.input)
	         | 'findWord' >> beam.FlatMap(lambda line: re.findall(r'[\w\']+', line.strip(), re.UNICODE))
	         | "lower" >> beam.Map(lambda word: word.lower())
	         | "lower_count" >> beam.combiners.Count.PerElement())