gregsheremeta/iris-training-pipeline.yaml

## iris-training-pipeline.yaml
# PIPELINE DEFINITION
# Name: iris-training-pipeline
# Inputs:
#    min_max_scaler: bool
#    neighbors: int
#    standard_scaler: bool
components:
  comp-create-dataset:
    executorLabel: exec-create-dataset
    outputDefinitions:
      artifacts:
        iris_dataset:
          artifactType:
            schemaTitle: system.Dataset
            schemaVersion: 0.0.1
  comp-normalize-dataset:
    executorLabel: exec-normalize-dataset
    inputDefinitions:
      artifacts:
        input_iris_dataset:
          artifactType:
            schemaTitle: system.Dataset
            schemaVersion: 0.0.1
      parameters:
        min_max_scaler:
          parameterType: BOOLEAN
        standard_scaler:
          parameterType: BOOLEAN
    outputDefinitions:
      artifacts:
        normalized_iris_dataset:
          artifactType:
            schemaTitle: system.Dataset
            schemaVersion: 0.0.1
  comp-train-model:
    executorLabel: exec-train-model
    inputDefinitions:
      artifacts:
        normalized_iris_dataset:
          artifactType:
            schemaTitle: system.Dataset
            schemaVersion: 0.0.1
      parameters:
        n_neighbors:
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      artifacts:
        model:
          artifactType:
            schemaTitle: system.Model
            schemaVersion: 0.0.1
deploymentSpec:
  executors:
    exec-create-dataset:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - create_dataset
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.6.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
          \ && \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef create_dataset(iris_dataset: Output[Dataset]):\n    import pandas\
          \ as pd\n\n    csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'\n\
          \    col_names = [\n        'Sepal_Length', 'Sepal_Width', 'Petal_Length',\
          \ 'Petal_Width', 'Labels'\n    ]\n    df = pd.read_csv(csv_url, names=col_names)\n\
          \n    with open(iris_dataset.path, 'w') as f:\n        df.to_csv(f)\n\n"
        image: docker.io/python:3.9.17
    exec-normalize-dataset:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - normalize_dataset
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.6.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
          \ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef normalize_dataset(\n    input_iris_dataset: Input[Dataset],\n\
          \    normalized_iris_dataset: Output[Dataset],\n    standard_scaler: bool,\n\
          \    min_max_scaler: bool,\n):\n    if standard_scaler is min_max_scaler:\n\
          \        raise ValueError(\n            'Exactly one of standard_scaler\
          \ or min_max_scaler must be True.')\n\n    import pandas as pd\n    from\
          \ sklearn.preprocessing import MinMaxScaler\n    from sklearn.preprocessing\
          \ import StandardScaler\n\n    with open(input_iris_dataset.path) as f:\n\
          \        df = pd.read_csv(f)\n    labels = df.pop('Labels')\n\n    if standard_scaler:\n\
          \        scaler = StandardScaler()\n    if min_max_scaler:\n        scaler\
          \ = MinMaxScaler()\n\n    df = pd.DataFrame(scaler.fit_transform(df))\n\
          \    df['Labels'] = labels\n    normalized_iris_dataset.metadata['state']\
          \ = \"Normalized\"\n    with open(normalized_iris_dataset.path, 'w') as\
          \ f:\n        df.to_csv(f)\n\n"
        image: docker.io/python:3.9.17
    exec-train-model:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - train_model
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.6.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
          \ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef train_model(\n    normalized_iris_dataset: Input[Dataset],\n\
          \    model: Output[Model],\n    n_neighbors: int,\n):\n    import pickle\n\
          \n    import pandas as pd\n    from sklearn.model_selection import train_test_split\n\
          \    from sklearn.neighbors import KNeighborsClassifier\n\n    with open(normalized_iris_dataset.path)\
          \ as f:\n        df = pd.read_csv(f)\n\n    y = df.pop('Labels')\n    X\
          \ = df\n\n    X_train, X_test, y_train, y_test = train_test_split(X, y,\
          \ random_state=0)\n\n    clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
          \    clf.fit(X_train, y_train)\n\n    model.metadata['framework'] = 'scikit-learn'\n\
          \    with open(model.path, 'wb') as f:\n        pickle.dump(clf, f)\n\n"
        image: docker.io/python:3.9.17
pipelineInfo:
  name: iris-training-pipeline
root:
  dag:
    tasks:
      create-dataset:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-create-dataset
        taskInfo:
          name: create-dataset
      normalize-dataset:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-normalize-dataset
        dependentTasks:
        - create-dataset
        inputs:
          artifacts:
            input_iris_dataset:
              taskOutputArtifact:
                outputArtifactKey: iris_dataset
                producerTask: create-dataset
          parameters:
            min_max_scaler:
              runtimeValue:
                constant: false
            standard_scaler:
              runtimeValue:
                constant: true
        taskInfo:
          name: normalize-dataset
      train-model:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-train-model
        dependentTasks:
        - normalize-dataset
        inputs:
          artifacts:
            normalized_iris_dataset:
              taskOutputArtifact:
                outputArtifactKey: normalized_iris_dataset
                producerTask: normalize-dataset
          parameters:
            n_neighbors:
              componentInputParameter: neighbors
        taskInfo:
          name: train-model
  inputDefinitions:
    parameters:
      min_max_scaler:
        parameterType: BOOLEAN
      neighbors:
        parameterType: NUMBER_INTEGER
      standard_scaler:
        parameterType: BOOLEAN
schemaVersion: 2.1.0
sdkVersion: kfp-2.6.0
	# PIPELINE DEFINITION
	# Name: iris-training-pipeline
	# Inputs:
	# min_max_scaler: bool
	# neighbors: int
	# standard_scaler: bool
	components:
	comp-create-dataset:
	executorLabel: exec-create-dataset
	outputDefinitions:
	artifacts:
	iris_dataset:
	artifactType:
	schemaTitle: system.Dataset
	schemaVersion: 0.0.1
	comp-normalize-dataset:
	executorLabel: exec-normalize-dataset
	inputDefinitions:
	artifacts:
	input_iris_dataset:
	artifactType:
	schemaTitle: system.Dataset
	schemaVersion: 0.0.1
	parameters:
	min_max_scaler:
	parameterType: BOOLEAN
	standard_scaler:
	parameterType: BOOLEAN
	outputDefinitions:
	artifacts:
	normalized_iris_dataset:
	artifactType:
	schemaTitle: system.Dataset
	schemaVersion: 0.0.1
	comp-train-model:
	executorLabel: exec-train-model
	inputDefinitions:
	artifacts:
	normalized_iris_dataset:
	artifactType:
	schemaTitle: system.Dataset
	schemaVersion: 0.0.1
	parameters:
	n_neighbors:
	parameterType: NUMBER_INTEGER
	outputDefinitions:
	artifacts:
	model:
	artifactType:
	schemaTitle: system.Model
	schemaVersion: 0.0.1
	deploymentSpec:
	executors:
	exec-create-dataset:
	container:
	args:
	- --executor_input
	- '{{$}}'
	- --function_to_execute
	- create_dataset
	command:
	- sh
	- -c
	- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip \|\|\
	\ python3 -m ensurepip --user \|\| apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
	\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.6.0'\
	\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
	\ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
	\ && \"$0\" \"$@\"\n"
	- sh
	- -ec
	- 'program_path=$(mktemp -d)


	printf "%s" "$0" > "$program_path/ephemeral_component.py"

	_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

	'
	- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
	\ *\n\ndef create_dataset(iris_dataset: Output[Dataset]):\n import pandas\
	\ as pd\n\n csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'\n\
	\ col_names = [\n 'Sepal_Length', 'Sepal_Width', 'Petal_Length',\
	\ 'Petal_Width', 'Labels'\n ]\n df = pd.read_csv(csv_url, names=col_names)\n\
	\n with open(iris_dataset.path, 'w') as f:\n df.to_csv(f)\n\n"
	image: docker.io/python:3.9.17
	exec-normalize-dataset:
	container:
	args:
	- --executor_input
	- '{{$}}'
	- --function_to_execute
	- normalize_dataset
	command:
	- sh
	- -c
	- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip \|\|\
	\ python3 -m ensurepip --user \|\| apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
	\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.6.0'\
	\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
	\ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
	\ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
	- sh
	- -ec
	- 'program_path=$(mktemp -d)


	printf "%s" "$0" > "$program_path/ephemeral_component.py"

	_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

	'
	- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
	\ *\n\ndef normalize_dataset(\n input_iris_dataset: Input[Dataset],\n\
	\ normalized_iris_dataset: Output[Dataset],\n standard_scaler: bool,\n\
	\ min_max_scaler: bool,\n):\n if standard_scaler is min_max_scaler:\n\
	\ raise ValueError(\n 'Exactly one of standard_scaler\
	\ or min_max_scaler must be True.')\n\n import pandas as pd\n from\
	\ sklearn.preprocessing import MinMaxScaler\n from sklearn.preprocessing\
	\ import StandardScaler\n\n with open(input_iris_dataset.path) as f:\n\
	\ df = pd.read_csv(f)\n labels = df.pop('Labels')\n\n if standard_scaler:\n\
	\ scaler = StandardScaler()\n if min_max_scaler:\n scaler\
	\ = MinMaxScaler()\n\n df = pd.DataFrame(scaler.fit_transform(df))\n\
	\ df['Labels'] = labels\n normalized_iris_dataset.metadata['state']\
	\ = \"Normalized\"\n with open(normalized_iris_dataset.path, 'w') as\
	\ f:\n df.to_csv(f)\n\n"
	image: docker.io/python:3.9.17
	exec-train-model:
	container:
	args:
	- --executor_input
	- '{{$}}'
	- --function_to_execute
	- train_model
	command:
	- sh
	- -c
	- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip \|\|\
	\ python3 -m ensurepip --user \|\| apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
	\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.6.0'\
	\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
	\ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
	\ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
	- sh
	- -ec
	- 'program_path=$(mktemp -d)


	printf "%s" "$0" > "$program_path/ephemeral_component.py"

	_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

	'
	- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
	\ *\n\ndef train_model(\n normalized_iris_dataset: Input[Dataset],\n\
	\ model: Output[Model],\n n_neighbors: int,\n):\n import pickle\n\
	\n import pandas as pd\n from sklearn.model_selection import train_test_split\n\
	\ from sklearn.neighbors import KNeighborsClassifier\n\n with open(normalized_iris_dataset.path)\
	\ as f:\n df = pd.read_csv(f)\n\n y = df.pop('Labels')\n X\
	\ = df\n\n X_train, X_test, y_train, y_test = train_test_split(X, y,\
	\ random_state=0)\n\n clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
	\ clf.fit(X_train, y_train)\n\n model.metadata['framework'] = 'scikit-learn'\n\
	\ with open(model.path, 'wb') as f:\n pickle.dump(clf, f)\n\n"
	image: docker.io/python:3.9.17
	pipelineInfo:
	name: iris-training-pipeline
	root:
	dag:
	tasks:
	create-dataset:
	cachingOptions:
	enableCache: true
	componentRef:
	name: comp-create-dataset
	taskInfo:
	name: create-dataset
	normalize-dataset:
	cachingOptions:
	enableCache: true
	componentRef:
	name: comp-normalize-dataset
	dependentTasks:
	- create-dataset
	inputs:
	artifacts:
	input_iris_dataset:
	taskOutputArtifact:
	outputArtifactKey: iris_dataset
	producerTask: create-dataset
	parameters:
	min_max_scaler:
	runtimeValue:
	constant: false
	standard_scaler:
	runtimeValue:
	constant: true
	taskInfo:
	name: normalize-dataset
	train-model:
	cachingOptions:
	enableCache: true
	componentRef:
	name: comp-train-model
	dependentTasks:
	- normalize-dataset
	inputs:
	artifacts:
	normalized_iris_dataset:
	taskOutputArtifact:
	outputArtifactKey: normalized_iris_dataset
	producerTask: normalize-dataset
	parameters:
	n_neighbors:
	componentInputParameter: neighbors
	taskInfo:
	name: train-model
	inputDefinitions:
	parameters:
	min_max_scaler:
	parameterType: BOOLEAN
	neighbors:
	parameterType: NUMBER_INTEGER
	standard_scaler:
	parameterType: BOOLEAN
	schemaVersion: 2.1.0
	sdkVersion: kfp-2.6.0