Play with PySpark from an Argo Workflow
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: pyspark-
spec:
  entrypoint: pyspark
  templates:
  - name: pyspark
    inputs:
      ## Great feature: we can create files directly within the workflow definition (djobi allows that too ^^)
      artifacts:
      - name: spark-config
        path: /opt/spark/conf/spark-defaults.conf
        raw:
          data: |
            spark.hadoop.dfs.nameservices: XXXXX:8020
            spark.hadoop.fs.defaultFS: hdfs://XXXXX:8020
            spark.hadoop.hive.metastore.uris: thrift://XXXXX:9083
            spark.hive.metastore.uris: thrift://XXXXX:9083
            spark.eventLog.enabled: false
    metadata:
      ## Some labels to satisfy the network policies
      labels:
        datatok.io/infra.kube-dns: allow
        datatok.io/infra.kubernetes-api: allow
        datatok.io/infra.hadoop-hdfs: allow
        app.kubernetes.io/component: workflow
        app.kubernetes.io/part-of: data-lake
    script:
      ## We use the image from https://github.com/datatok/docker-spark
      image: ghcr.io/datatok/spark:v3.3.0-edge
      ## The script is run with Python 3
      command: ["python3"]
      source: |
        from pyspark.sql import SparkSession
        ## Create a Spark session
        spark = SparkSession.builder.getOrCreate()
        ## Read the files from HDFS (the default FS)
        df = spark.read.parquet("/user/datatok/year=2022/month=10/day=10")
        ## Print the schema
        df.printSchema()
      ## We must tell Python where to find the Spark libs (ideally this should be done within the Docker image ...)
      env:
      - name: PYTHONPATH
        value: /opt/spark/python/lib/py4j-0.10.9.5-src.zip:/opt/spark/python/lib/pyspark.zip
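
For reference, the settings that the raw spark-config artifact writes into spark-defaults.conf could also be set programmatically when the session is built. Below is a minimal sketch, assuming the same placeholder endpoints (XXXXX) and the same image / PYTHONPATH as in the workflow above; the appName is purely illustrative.

from pyspark.sql import SparkSession

## Sketch: configure the same options in code instead of via spark-defaults.conf.
## The XXXXX endpoints are placeholders, exactly as in the workflow above.
spark = (
    SparkSession.builder
    .appName("pyspark-argo-demo")  ## hypothetical name, for illustration only
    .config("spark.hadoop.fs.defaultFS", "hdfs://XXXXX:8020")
    .config("spark.hadoop.hive.metastore.uris", "thrift://XXXXX:9083")
    .config("spark.hive.metastore.uris", "thrift://XXXXX:9083")
    .config("spark.eventLog.enabled", "false")
    .getOrCreate()
)

## Same read as the workflow script: partitioned parquet on HDFS (the default FS).
df = spark.read.parquet("/user/datatok/year=2022/month=10/day=10")
df.printSchema()

spark.stop()

Either approach works; the artifact-based spark-defaults.conf keeps the Python source free of cluster-specific endpoints.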