# Evanto/ge-pipeline.py

## ge-pipeline.py
import kfp.dsl
from kfp.components import ComponentStore

# Component factories are looked up by name in kfp's default component store.
store = ComponentStore.default_store
chicago_taxi_dataset_op = store.load_component('datasets/Chicago_Taxi_Trips')
validate_csv_op = store.load_component('great-expectations/validate/CSV')
xgboost_train_on_csv_op = store.load_component('XGBoost/Train')

# The expectation suite is read once, at import time, and kept as raw JSON
# text (it is not parsed here). The path is resolved relative to the current
# working directory. The encoding is pinned to UTF-8 so the result does not
# depend on the platform's locale default.
with open('expectation_suite.json', encoding='utf-8') as file:
    expectation_suite = file.read()


@kfp.dsl.pipeline(name='XGBoost Train')
def xgboost_train_pipeline(start_date: str = '2019-01-01',
                           end_date: str = '2019-02-01',
                           limit: int = 100):
    """Query taxi trips, validate the resulting CSV, then train XGBoost on it.

    Args:
        start_date: Inclusive lower bound compared against
            ``trip_start_timestamp`` in the dataset query.
        end_date: Exclusive upper bound compared against
            ``trip_start_timestamp`` in the dataset query.
        limit: Forwarded as the ``limit`` argument of the dataset component.
    """
    label = 'tips'
    feature_columns = (
        'trip_seconds',
        'trip_miles',
        'pickup_community_area',
        'dropoff_community_area',
        'fare',
        'tolls',
        'extras',
        'trip_total',
    )
    # NOTE(review): 'trip_total' presumably already includes 'tips', which
    # would leak the label into the features -- confirm against the dataset.

    # The label is listed first in the selection; label_column=0 below
    # presumably relies on that ordering being kept in the CSV.
    selected_columns = ','.join([label, *feature_columns])
    date_filter = f'trip_start_timestamp >= "{start_date}" AND trip_start_timestamp < "{end_date}"'

    dataset_task = chicago_taxi_dataset_op(
        select=selected_columns,
        where=date_filter,
        limit=limit,
    )
    training_data_csv = dataset_task.output

    # Check the fetched CSV against the expectation suite loaded at import time.
    validation_task = validate_csv_op(training_data_csv, expectation_suite)

    training_task = xgboost_train_on_csv_op(
        training_data=training_data_csv,
        label_column=0,
        objective='reg:squarederror',
        num_iterations=200,
    )
    # Training consumes the dataset output directly, so it has no data
    # dependency on validation; order it explicitly so that training starts
    # only after the validation step.
    training_task.after(validation_task)
	# Second copy of the pipeline script, re-indented (its nesting had been collapsed to single tabs).
import kfp.dsl
from kfp.components import ComponentStore

# Component factories are looked up by name in kfp's default component store.
store = ComponentStore.default_store
chicago_taxi_dataset_op = store.load_component('datasets/Chicago_Taxi_Trips')
validate_csv_op = store.load_component("great-expectations/validate/CSV")

xgboost_train_on_csv_op = store.load_component('XGBoost/Train')

# Read the expectation suite once at import time as raw JSON text; the path is
# relative to the current working directory. UTF-8 is set explicitly so the
# result does not depend on the platform's locale default.
with open('expectation_suite.json', encoding='utf-8') as file:
    expectation_suite = file.read()


@kfp.dsl.pipeline(name='XGBoost Train')
def xgboost_train_pipeline(start_date: str = '2019-01-01',
                           end_date: str = '2019-02-01',
                           limit: int = 100):
    """Query taxi trips, validate the resulting CSV, then train XGBoost on it.

    Args:
        start_date: Inclusive lower bound compared against
            ``trip_start_timestamp`` in the dataset query.
        end_date: Exclusive upper bound compared against
            ``trip_start_timestamp`` in the dataset query.
        limit: Forwarded as the ``limit`` argument of the dataset component.
    """
    features = ['trip_seconds', 'trip_miles', 'pickup_community_area', 'dropoff_community_area',
                'fare', 'tolls', 'extras', 'trip_total']
    target = 'tips'

    # The target is listed first in the selection; label_column=0 below
    # presumably relies on that ordering being kept in the CSV.
    training_data_csv = chicago_taxi_dataset_op(
        select=','.join([target] + features),
        where=f'trip_start_timestamp >= "{start_date}" AND trip_start_timestamp < "{end_date}"',
        limit=limit,
    ).output

    # Check the fetched CSV against the expectation suite loaded above.
    validate_csv = validate_csv_op(training_data_csv, expectation_suite)

    # Training
    training_step = xgboost_train_on_csv_op(
        training_data=training_data_csv,
        label_column=0,
        objective='reg:squarederror',
        num_iterations=200,
    )
    training_step.after(validate_csv)  # Start training only after successful validation