# spaceflight catalog
# Here you can define all your data sets by using simple YAML syntax.
#
# Documentation for this file format can be found in "The Data Catalog"
# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html
#
# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
#
# An example data set definition can look as follows:
#
#bikes:
#  type: pandas.CSVDataSet
#  filepath: "data/01_raw/bikes.csv"
#
#weather:
#  type: spark.SparkDataSet
#  filepath: s3a://your_bucket/data/01_raw/weather*
#  file_format: csv
#  credentials: dev_s3
#  load_args:
#    header: True
#    inferSchema: True
#  save_args:
#    sep: '|'
#    header: True
#
#scooters:
#  type: pandas.SQLTableDataSet
#  credentials: scooters_credentials
#  table_name: scooters
#  load_args:
#    index_col: ['name']
#    columns: ['name', 'gear']
#  save_args:
#    if_exists: 'replace'
#    # if_exists: 'fail'
#    # if_exists: 'append'
#
# The Data Catalog also supports referencing the same file through two different DataSet implementations
# (transcoding), templating, and reusing arguments that are frequently repeated (see the commented
# sketch below). More details here:
# https://kedro.readthedocs.io/en/stable/data/data_catalog.html
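#
# For illustration only -- a minimal sketch whose dataset names and paths are made up
# and are not part of this catalog. The `@` suffix registers the same file under two
# DataSet implementations (transcoding), and a YAML anchor lets shared arguments be
# written once and merged into several entries:
#
#_spark_csv: &spark_csv        # underscore-prefixed key used only to hold the anchor
#  type: spark.SparkDataSet
#  file_format: csv
#  load_args:
#    header: True
#
#trips@spark:                  # read/write the file with Spark in some nodes...
#  <<: *spark_csv
#  filepath: data/02_intermediate/trips.csv
#
#trips@pandas:                 # ...and read the same file with pandas in others
#  type: pandas.CSVDataSet
#  filepath: data/02_intermediate/trips.csv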
companies:
  type: pandas.CSVDataSet
  filepath: data/01_raw/companies.csv
  # more about layers in the Data Engineering Convention:
  # https://kedro.readthedocs.io/en/stable/tutorial/visualise_pipeline.html#interact-with-data-engineering-convention
  layer: raw

reviews:
  type: pandas.CSVDataSet
  filepath: data/01_raw/reviews.csv
  layer: raw

shuttles:
  type: pandas.ExcelDataSet
  filepath: data/01_raw/shuttles.xlsx
  layer: raw

data_processing.preprocessed_companies:
  type: pandas.ParquetDataSet
  filepath: data/02_intermediate/preprocessed_companies.pq
  layer: intermediate

data_processing.preprocessed_shuttles:
  type: pandas.ParquetDataSet
  filepath: data/02_intermediate/preprocessed_shuttles.pq
  layer: intermediate

model_input_table:
  type: pandas.ParquetDataSet
  filepath: data/03_primary/model_input_table.pq
  layer: primary

data_science.active_modelling_pipeline.regressor:
  type: pickle.PickleDataSet
  filepath: data/06_models/regressor_active.pickle
  versioned: true
  layer: models

data_science.candidate_modelling_pipeline.regressor:
  type: pickle.PickleDataSet
  filepath: data/06_models/regressor_candidate.pickle
  versioned: true
  layer: models
data_science.candidate_modelling_pipeline.metrics:
  type: tracking.MetricsDataSet
  filepath: data/09_tracking/metrics_candidate.json

data_science.active_modelling_pipeline.metrics:
  type: tracking.MetricsDataSet
  filepath: data/09_tracking/metrics_active.json
data_processing.companies_columns:
  type: tracking.JSONDataSet
  filepath: data/09_tracking/companies_columns.json