Skip to content

Instantly share code, notes, and snippets.

@Evanto
Created April 12, 2021 21:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Evanto/dcecfb245503bd3537d7514e1227354e to your computer and use it in GitHub Desktop.
Save Evanto/dcecfb245503bd3537d7514e1227354e to your computer and use it in GitHub Desktop.
import kfp.dsl
from kfp.components import ComponentStore
# Component factories are looked up by name in kfp's default component store.
store = ComponentStore.default_store
chicago_taxi_dataset_op = store.load_component('datasets/Chicago_Taxi_Trips')
validate_csv_op = store.load_component("great-expectations/validate/CSV")
xgboost_train_on_csv_op = store.load_component('XGBoost/Train')

# The expectation suite is read once at import time and kept as raw text;
# the pipeline below passes that text to the CSV validation component.
# The encoding is pinned so the read does not depend on the platform locale.
with open('expectation_suite.json', encoding='utf-8') as file:
    expectation_suite = file.read()
# Pipeline: select Chicago Taxi Trips rows for a date window, validate the
# resulting CSV against the expectation suite, then train an XGBoost
# regression model on that same CSV.
#
# Parameters:
#   start_date -- inclusive lower bound on trip_start_timestamp.
#   end_date   -- exclusive upper bound on trip_start_timestamp.
#   limit      -- passed through as the dataset component's `limit` argument.
#
# NOTE(review): documented with comments instead of a docstring on purpose;
# kfp may use a function docstring as the pipeline description -- confirm
# before converting.
@kfp.dsl.pipeline(name='XGBoost Train')
def xgboost_train_pipeline(
    start_date: str = '2019-01-01',
    end_date: str = '2019-02-01',
    limit: int = 100,
):
    label_name = 'tips'
    feature_names = [
        'trip_seconds',
        'trip_miles',
        'pickup_community_area',
        'dropoff_community_area',
        'fare',
        'tolls',
        'extras',
        'trip_total',
    ]

    # The label is listed first, which is why training uses label_column=0.
    selected_columns = ','.join([label_name, *feature_names])
    date_filter = f'trip_start_timestamp >= "{start_date}" AND trip_start_timestamp < "{end_date}"'

    dataset_task = chicago_taxi_dataset_op(
        select=selected_columns,
        where=date_filter,
        limit=limit,
    )
    training_data_csv = dataset_task.output

    validation_task = validate_csv_op(training_data_csv, expectation_suite)

    # Training
    train_task = xgboost_train_on_csv_op(
        training_data=training_data_csv,
        label_column=0,
        objective='reg:squarederror',
        num_iterations=200,
    )
    # Training reads the dataset output directly, so without this explicit
    # ordering it would not wait for the validation step.
    train_task.after(validation_task)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment