mik-laj/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Dataflow jobs

Java

gcloud builds submit \
    --config=cloudbuild-java.yaml \
    --substitutions=_APACHE_BEAM_VERSION="2.22.0",_PIPELINE_WAIT_UNTIL_FINISH=false
Python

gcloud builds submit \
    --config=cloudbuild-python.yaml \
    --substitutions=_APACHE_BEAM_VERSION="2.22.0",_PIPELINE_WAIT_UNTIL_FINISH=false

  
## cloudbuild-java.yaml
# Cloud Build config: builds the Dataflow "templates" WordCount sample from
# java-docs-samples against a chosen Apache Beam version, runs it on Dataflow,
# and uploads the bundled jar to Cloud Storage.
steps:
  # Fetch the sample repository and pin it to the commit given by _GIT_COMMIT.
  - name: gcr.io/cloud-builders/git
    args: ['clone', 'https://github.com/GoogleCloudPlatform/java-docs-samples.git', 'repo_dir']
  - name: gcr.io/cloud-builders/git
    args: ['checkout', '$_GIT_COMMIT']
    dir: 'repo_dir'
  # Rewrite pom.xml in place: set the Beam version to _APACHE_BEAM_VERSION ...
  - name: maven:3.6.3-jdk-8
    args:
      - sed
      - -i
      - 's#<beam.version>.\+</beam.version>#<beam.version>${_APACHE_BEAM_VERSION}</beam.version>#'
      - repo_dir/dataflow/templates/pom.xml
  # ... and force the compiler target/source level to 1.8.
  - name: maven:3.6.3-jdk-8
    args:
      - sed
      - -i
      - 's#<maven.compiler.target>.\+</maven.compiler.target>#<maven.compiler.target>1.8</maven.compiler.target>#'
      - repo_dir/dataflow/templates/pom.xml
  - name: maven:3.6.3-jdk-8
    args:
      - sed
      - -i
      - 's#<maven.compiler.source>.\+</maven.compiler.source>#<maven.compiler.source>1.8</maven.compiler.source>#'
      - repo_dir/dataflow/templates/pom.xml
  # Print the patched pom.xml to the build log for inspection.
  - name: maven:3.6.3-jdk-8
    args:
      - cat
      - repo_dir/dataflow/templates/pom.xml
  # When _PIPELINE_WAIT_UNTIL_FINISH is "true", make the sample block until the
  # job finishes. An explicit if/then is used instead of "cond && sed || true"
  # so that a failing sed (for example, the source file is missing at the
  # pinned commit) fails the build instead of being silently ignored.
  - name: maven:3.6.3-jdk-8
    args:
      - bash
      - -c
      - |
        if [[ "${_PIPELINE_WAIT_UNTIL_FINISH}" == "true" ]]; then
          sed -i "s#pipeline.run();#pipeline.run().waitUntilFinish();#" \
            repo_dir/dataflow/templates/src/main/java/com/example/dataflow/templates/WordCount.java
        fi
  # Build the bundled jar.
  - name: maven:3.6.3-jdk-8
    args: ["mvn", "clean", "package"]
    dir: 'repo_dir/dataflow/templates'
  # Run the freshly built jar with the Dataflow runner in the build's project.
  - name: maven:3.6.3-jdk-8
    args:
      - java
      - -jar
      - repo_dir/dataflow/templates/target/dataflow-templates-bundled-1.0.jar
      - --project=${PROJECT_ID}
      - --runner=DataflowRunner
      - --tempLocation=gs://test-dataflow-example/temp/
      - --stagingLocation=gs://test-dataflow-example/staging/
      - '--outputBucket=test-dataflow-example-output'
      - '--labels={"airflow-version":"v2-0-0-dev0"}'
      - --region=europe-west3
  # Upload the jar, encoding the Beam version and wait flag in the object name.
  # NOTE(review): the object name says "java=11", but the steps above compile
  # for 1.8 on a JDK 8 image. The name is left unchanged because consumers may
  # reference it - confirm which one is intended.
  - name: gcr.io/google.com/cloudsdktool/cloud-sdk
    args:
      - gsutil
      - cp
      - repo_dir/dataflow/templates/target/dataflow-templates-bundled-1.0.jar
      - 'gs://test-dataflow-example/builds/dataflow-templates-bundled-java=11-beam=${_APACHE_BEAM_VERSION}-wait_until_finish=${_PIPELINE_WAIT_UNTIL_FINISH}.jar'

# Defaults; override with "gcloud builds submit --substitutions=...".
substitutions:
  _GIT_COMMIT: "e186156021f63bc4ad40bb12ecac11946d60b992"
  _APACHE_BEAM_VERSION: "2.22.0"
  _PIPELINE_WAIT_UNTIL_FINISH: "true"


## cloudbuild-python.yaml
# Cloud Build config: downloads the Apache Beam wordcount example for a chosen
# Beam version, optionally strips the blocking wait, and runs it on Dataflow.
steps:
  # Download the example that matches _APACHE_BEAM_VERSION. --fail makes curl
  # exit non-zero on an HTTP error (for example, an unknown version tag)
  # instead of saving the error page as wordcount.py and reporting success.
  - name: 'python:3.7-stretch'
    args:
      - bash
      - -c
      - |
        curl --fail --silent --show-error \
          --output wordcount.py \
          "https://raw.githubusercontent.com/apache/beam/v${_APACHE_BEAM_VERSION}/sdks/python/apache_beam/examples/wordcount.py"
  # When _PIPELINE_WAIT_UNTIL_FINISH is "false", delete the
  # result.wait_until_finish() call so the script returns after submitting.
  # An explicit if/then is used instead of "cond && sed || true" so that a
  # failing sed fails the build instead of being silently ignored.
  - name: 'python:3.7-stretch'
    args:
      - bash
      - -c
      - |
        if [[ "${_PIPELINE_WAIT_UNTIL_FINISH}" == "false" ]]; then
          sed -i "s#result\.wait_until_finish()##" wordcount.py
        fi
  # Print the (possibly patched) script to the build log for inspection.
  - name: 'python:3.7-stretch'
    args:
      - bash
      - -c
      - |
        cat wordcount.py

  # Install the matching Beam SDK and launch the job on Dataflow. The project
  # comes from the built-in PROJECT_ID substitution, matching the Java config,
  # rather than a hard-coded project name.
  # NOTE(review): --job_name is fixed, so overlapping builds would submit jobs
  # with the same name - confirm whether that is acceptable.
  - name: 'python:3.7-stretch'
    args:
      - bash
      - -c
      - |
        pip install 'apache-beam[gcp]==${_APACHE_BEAM_VERSION}' && \
        python wordcount.py \
          --runner=DataflowRunner \
          --project=${PROJECT_ID} \
          --temp_location=gs://test-dataflow-example/temp/ \
          --staging_location=gs://test-dataflow-example/staging/ \
          --output=gs://test-dataflow-example/output \
          --labels=airflow-version=v2-0-0-dev0 \
          --job_name=start-python-job-local-5bcf3d71 \
          --region=us-central1

# Defaults; override with "gcloud builds submit --substitutions=...".
substitutions:
  _APACHE_BEAM_VERSION: "2.22.0"
  _PIPELINE_WAIT_UNTIL_FINISH: "true"

## wordcount.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow."""

from __future__ import absolute_import

import argparse
import logging
import re

from past.builtins import unicode

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class WordExtractingDoFn(beam.DoFn):
  """DoFn that splits a line of text into words and records word metrics."""

  def __init__(self):
    # All metrics are registered under this class as their namespace.
    namespace = self.__class__
    self.words_counter = Metrics.counter(namespace, 'words')
    self.word_lengths_counter = Metrics.counter(namespace, 'word_lengths')
    self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
    self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')

  def process(self, element):
    """Extracts the words found in one line of text.

    A line that is empty after stripping whitespace bumps the empty-line
    counter. Every extracted word bumps the word counter and feeds its length
    into both the length counter and the length distribution.

    Args:
      element: the line of text being processed.

    Returns:
      The list of words matched in the stripped line (empty if none).
    """
    stripped = element.strip()
    if not stripped:
      self.empty_line_counter.inc(1)

    # A word is a run of word characters and/or apostrophes.
    found_words = re.findall(r'[\w\']+', stripped, re.UNICODE)
    for word_length in map(len, found_words):
      self.words_counter.inc()
      self.word_lengths_counter.inc(word_length)
      self.word_lengths_dist.update(word_length)
    return found_words


def run(argv=None):
  """Builds the wordcount pipeline from command-line flags and executes it.

  Args:
    argv: optional list of command-line arguments. ``--input`` and
      ``--output`` are consumed here; all remaining arguments are passed on
      to ``PipelineOptions``.
  """
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  arg_parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = arg_parser.parse_known_args(argv)

  # save_main_session is enabled because one or more DoFn's in this workflow
  # rely on global context (e.g., a module imported at module level).
  options = PipelineOptions(pipeline_args)
  options.view_as(SetupOptions).save_main_session = True
  pipeline = beam.Pipeline(options=options)

  def count_ones(word_ones):
    # Collapse a (word, iterable of ones) pair into (word, total).
    word, ones = word_ones
    return word, sum(ones)

  def format_result(word_count):
    # Render a (word, count) pair as one output line.
    word, count = word_count
    return '%s: %d' % (word, count)

  # Read the text file[pattern], split it into words and count each word.
  lines = pipeline | 'read' >> ReadFromText(known_args.input)
  words = lines | 'split' >> (
      beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
  paired = words | 'pair_with_one' >> beam.Map(lambda word: (word, 1))
  grouped = paired | 'group' >> beam.GroupByKey()
  counts = grouped | 'count' >> beam.Map(count_ones)

  # Turn the counts into strings and write them out. The write transform is
  # applied for its side effect only.
  formatted = counts | 'format' >> beam.Map(format_result)
  # pylint: disable=expression-not-assigned
  formatted | 'write' >> WriteToText(known_args.output)

  result = pipeline.run()
  result.wait_until_finish()

  # Skip the metric queries when the result exposes has_job and it is falsy
  # (a template was created but nothing ran). Results without that attribute
  # (direct runner) always fall through to the queries.
  if hasattr(result, 'has_job') and not result.has_job:
    return

  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  counters = result.metrics().query(empty_lines_filter)['counters']
  if counters:
    logging.info('number of empty lines: %d', counters[0].result)

  word_lengths_filter = MetricsFilter().with_name('word_len_dist')
  distributions = result.metrics().query(word_lengths_filter)['distributions']
  if distributions:
    logging.info('average word length: %d', distributions[0].result.mean)


# Script entry point: set the root logger's level to INFO (run() reports its
# metrics through logging.info), then build and execute the pipeline.
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
	steps:
	- name: gcr.io/cloud-builders/git
	args: ['clone', 'https://github.com/GoogleCloudPlatform/java-docs-samples.git', 'repo_dir']
	- name: gcr.io/cloud-builders/git
	args: ['checkout', '$_GIT_COMMIT']
	dir: 'repo_dir'
	- name: maven:3.6.3-jdk-8
	args:
	- sed
	- -i
	- 's#<beam.version>.\+</beam.version>#<beam.version>${_APACHE_BEAM_VERSION}</beam.version>#'
	- repo_dir/dataflow/templates/pom.xml
	- name: maven:3.6.3-jdk-8
	args:
	- sed
	- -i
	- 's#<maven.compiler.target>.\+</maven.compiler.target>#<maven.compiler.target>1.8</maven.compiler.target>#'
	- repo_dir/dataflow/templates/pom.xml
	- name: maven:3.6.3-jdk-8
	args:
	- sed
	- -i
	- 's#<maven.compiler.source>.\+</maven.compiler.source>#<maven.compiler.source>1.8</maven.compiler.source>#'
	- repo_dir/dataflow/templates/pom.xml
	- name: maven:3.6.3-jdk-8
	args:
	- cat
	- repo_dir/dataflow/templates/pom.xml
	- name: maven:3.6.3-jdk-8
	args:
	- bash
	- -c
	- |
	[[ "${_PIPELINE_WAIT_UNTIL_FINISH}" == "true" ]] && \
	sed -i "s#pipeline.run();#pipeline.run().waitUntilFinish();#" \
	repo_dir/dataflow/templates/src/main/java/com/example/dataflow/templates/WordCount.java \
	|| true
	- name: maven:3.6.3-jdk-8
	args: ["mvn", "clean", "package"]
	dir: 'repo_dir/dataflow/templates'
	- name: maven:3.6.3-jdk-8
	args:
	- java
	- -jar
	- repo_dir/dataflow/templates/target/dataflow-templates-bundled-1.0.jar
	- --project=${PROJECT_ID}
	- --runner=DataflowRunner
	- --tempLocation=gs://test-dataflow-example/temp/
	- --stagingLocation=gs://test-dataflow-example/staging/
	- '--outputBucket=test-dataflow-example-output'
	- '--labels={"airflow-version":"v2-0-0-dev0"}'
	- --region=europe-west3
	- name: gcr.io/google.com/cloudsdktool/cloud-sdk
	args:
	- gsutil
	- cp
	- repo_dir/dataflow/templates/target/dataflow-templates-bundled-1.0.jar
	- 'gs://test-dataflow-example/builds/dataflow-templates-bundled-java=11-beam=${_APACHE_BEAM_VERSION}-wait_until_finish=${_PIPELINE_WAIT_UNTIL_FINISH}.jar'

	substitutions:
	_GIT_COMMIT: "e186156021f63bc4ad40bb12ecac11946d60b992"
	_APACHE_BEAM_VERSION: "2.22.0"
	_PIPELINE_WAIT_UNTIL_FINISH: "true"
	steps:
	- name: 'python:3.7-stretch'
	args:
	- bash
	- -c
	- curl https://raw.githubusercontent.com/apache/beam/v${_APACHE_BEAM_VERSION}/sdks/python/apache_beam/examples/wordcount.py > wordcount.py
	- name: 'python:3.7-stretch'
	args:
	- bash
	- -c
	- |
	[[ "${_PIPELINE_WAIT_UNTIL_FINISH}" == "false" ]] && \
	sed -i "s#result\.wait_until_finish()##" wordcount.py \
	|| true
	- name: 'python:3.7-stretch'
	args:
	- bash
	- -c
	- |
	cat wordcount.py

	- name: 'python:3.7-stretch'
	args:
	- bash
	- -c
	- |
	pip install 'apache-beam[gcp]==${_APACHE_BEAM_VERSION}' && \
	python wordcount.py \
	--runner=DataflowRunner \
	--project=polidea-airflow \
	--temp_location=gs://test-dataflow-example/temp/ \
	--staging_location=gs://test-dataflow-example/staging/ \
	--output=gs://test-dataflow-example/output \
	--labels=airflow-version=v2-0-0-dev0 \
	--job_name=start-python-job-local-5bcf3d71 \
	--region=us-central1

	substitutions:
	_APACHE_BEAM_VERSION: "2.22.0"
	_PIPELINE_WAIT_UNTIL_FINISH: "true"
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	"""A word-counting workflow."""

	from __future__ import absolute_import

	import argparse
	import logging
	import re

	from past.builtins import unicode

	import apache_beam as beam
	from apache_beam.io import ReadFromText
	from apache_beam.io import WriteToText
	from apache_beam.metrics import Metrics
	from apache_beam.metrics.metric import MetricsFilter
	from apache_beam.options.pipeline_options import PipelineOptions
	from apache_beam.options.pipeline_options import SetupOptions


	class WordExtractingDoFn(beam.DoFn):
	"""Parse each line of input text into words."""

	def __init__(self):
	self.words_counter = Metrics.counter(self.__class__, 'words')
	self.word_lengths_counter = Metrics.counter(self.__class__, 'word_lengths')
	self.word_lengths_dist = Metrics.distribution(
	self.__class__, 'word_len_dist')
	self.empty_line_counter = Metrics.counter(self.__class__, 'empty_lines')

	def process(self, element):
	"""Returns an iterator over the words of this element.

	The element is a line of text. If the line is blank, note that, too.

	Args:
	element: the element being processed

	Returns:
	The processed element.
	"""
	text_line = element.strip()
	if not text_line:
	self.empty_line_counter.inc(1)
	words = re.findall(r'[\w\']+', text_line, re.UNICODE)
	for w in words:
	self.words_counter.inc()
	self.word_lengths_counter.inc(len(w))
	self.word_lengths_dist.update(len(w))
	return words


	def run(argv=None):
	"""Main entry point; defines and runs the wordcount pipeline."""
	parser = argparse.ArgumentParser()
	parser.add_argument('--input',
	dest='input',
	default='gs://dataflow-samples/shakespeare/kinglear.txt',
	help='Input file to process.')
	parser.add_argument('--output',
	dest='output',
	required=True,
	help='Output file to write results to.')
	known_args, pipeline_args = parser.parse_known_args(argv)

	# We use the save_main_session option because one or more DoFn's in this
	# workflow rely on global context (e.g., a module imported at module level).
	pipeline_options = PipelineOptions(pipeline_args)
	pipeline_options.view_as(SetupOptions).save_main_session = True
	p = beam.Pipeline(options=pipeline_options)

	# Read the text file[pattern] into a PCollection.
	lines = p | 'read' >> ReadFromText(known_args.input)

	# Count the occurrences of each word.
	def count_ones(word_ones):
	(word, ones) = word_ones
	return (word, sum(ones))

	counts = (lines
	| 'split' >> (beam.ParDo(WordExtractingDoFn())
	.with_output_types(unicode))
	| 'pair_with_one' >> beam.Map(lambda x: (x, 1))
	| 'group' >> beam.GroupByKey()
	| 'count' >> beam.Map(count_ones))

	# Format the counts into a PCollection of strings.
	def format_result(word_count):
	(word, count) = word_count
	return '%s: %d' % (word, count)

	output = counts | 'format' >> beam.Map(format_result)

	# Write the output using a "Write" transform that has side effects.
	# pylint: disable=expression-not-assigned
	output | 'write' >> WriteToText(known_args.output)

	result = p.run()
	result.wait_until_finish()

	# Do not query metrics when creating a template which doesn't run
	if (not hasattr(result, 'has_job') # direct runner
	or result.has_job): # not just a template creation
	empty_lines_filter = MetricsFilter().with_name('empty_lines')
	query_result = result.metrics().query(empty_lines_filter)
	if query_result['counters']:
	empty_lines_counter = query_result['counters'][0]
	logging.info('number of empty lines: %d', empty_lines_counter.result)

	word_lengths_filter = MetricsFilter().with_name('word_len_dist')
	query_result = result.metrics().query(word_lengths_filter)
	if query_result['distributions']:
	word_lengths_dist = query_result['distributions'][0]
	logging.info('average word length: %d', word_lengths_dist.result.mean)


	if __name__ == '__main__':
	logging.getLogger().setLevel(logging.INFO)
	run()