Skip to content

Instantly share code, notes, and snippets.

@datajoely
Last active May 6, 2024 22:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save datajoely/018607d5d721c747d742605494b822a3 to your computer and use it in GitHub Desktop.
Save datajoely/018607d5d721c747d742605494b822a3 to your computer and use it in GitHub Desktop.
def create_template_pipeline() -> Pipeline:
    """Template declared here with real inputs, but placeholder outputs and parameters.

    The dataset names "spine", "training_set", "test_set", "model_type",
    "model_params", "model_object" and "model_output" are placeholders that
    callers remap via ``pipeline(..., inputs=..., outputs=...)``.

    Returns:
        A ``Pipeline`` of three nodes: feature assembly, model training and
        estimation, all tagged ``'data science'``.
    """
    return Pipeline(
        [
            node(
                func=create_model_inputs,
                inputs=[  # These inputs are never overridden
                    "feat_days_since_last_shutdown",
                    "feat_days_between_shutdown_last_maintenance",
                    "feat_fte_maintenance_hours_last_6m",
                ],
                outputs=["spine", "training_set", "test_set"],  # These output placeholders are overridden
                tags='data science',
            ),
            node(
                func=train_model,
                inputs=["spine", "training_set", "model_type"],  # These inputs inherit overridden placeholders
                outputs="model_object",
                tags='data science',
            ),
            node(
                func=estimate,
                inputs=["spine", "model_object", "test_set", "model_params"],  # These inputs inherit overridden outputs
                outputs="model_output",
                tags='data science',
            ),
        ]
    )
def create_data_science_pipeline(**kwargs) -> Pipeline:
    """Create a single modular pipeline by reusing ``create_template_pipeline()`` twice.

    Instantiates the template once as an sklearn prediction pipeline and once
    as a pytorch classification pipeline, remapping the placeholder dataset
    names to concrete catalog entries, and returns their sum.
    """
    return pipeline(  # Prediction pipeline
        create_template_pipeline(),
        inputs={  # Overridden input params
            "model_type": "params:model_type.sklearn",
            "model_params": "params:hyperparams.sklearn",
        },
        outputs={  # Overridden output catalog entries
            "spine": "spine_time_series",
            "training_set": "train_time_series",
            "test_set": "test_time_series",
            "model_object": "sklearn_predictor",
            "model_output": "recommended_maintenance_schedule",
        },
    ) + pipeline(  # Classification pipeline
        create_template_pipeline(),
        inputs={  # Overridden input params
            "model_type": "params:model_type.pytorch",
            "model_params": "params:hyperparams.pytorch",
        },
        outputs={  # Overridden output catalog entries
            "spine": "spine_equipment_level",
            "training_set": "train_equipment_level",
            "test_set": "test_equipment_level",
            "model_object": "pytorch_classifier",
            "model_output": "risk_scored_equipment",
        },
    )
@datajoely
Copy link
Author

The relevant catalog entries would look something like this...

# Here you can define all your data sets by using simple YAML syntax.
#
# Documentation for this file format can be found in "The Data Catalog"
# Link: https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html
#
# Catalog entries backing the two modular pipelines: one set of datasets for
# the time-series (prediction) pipeline and one for the equipment-level
# (classification) pipeline, plus the trained model objects and final outputs.

spine_time_series:
  type: pandas.ParquetDataSet
  filepath: data/05_model_input/mi_time_series_spine.pq
  layer: model_input

train_time_series:
  type: pandas.ParquetDataSet
  filepath: data/05_model_input/mi_train_time_series.pq
  layer: model_input

test_time_series:
  type: pandas.ParquetDataSet
  filepath: data/05_model_input/mi_test_time_series.pq
  layer: model_input

spine_equipment_level:
  type: pandas.ParquetDataSet
  filepath: data/05_model_input/mi_equipment_level_spine.pq
  layer: model_input

train_equipment_level:
  type: pandas.ParquetDataSet
  filepath: data/05_model_input/mi_train_equipment_level.pq
  layer: model_input

test_equipment_level:
  type: pandas.ParquetDataSet
  filepath: data/05_model_input/mi_test_equipment_level.pq
  layer: model_input

sklearn_predictor:
  type: pickle.PickleDataSet
  filepath: data/06_models/sklearn_predictor.pkl
  layer: models

pytorch_classifier:
  type: pickle.PickleDataSet
  filepath: data/06_models/pytorch_classifier.pkl
  layer: models

recommended_maintenance_schedule:
  type: pandas.ExcelDataSet
  # Filepath spelling fixed to match the dataset name above
  # (was "recomended_maintaince_schedule.xlsx").
  filepath: data/07_model_output/recommended_maintenance_schedule.xlsx
  layer: model_output
  save_args:
    engine: openpyxl

risk_scored_equipment:
  type: kedro.extras.datasets.json.JSONDataSet
  filepath: data/07_model_output/risk_scored_equipment.json
  layer: model_output
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment