# chrisdmell/catboost.py

## catboost.py
class catboost_regressor():
    """
    Thin wrapper around a ``CatBoostRegressor``.

    Every ``model_run*`` method splits the dataframe 75/25 with a fixed seed,
    trains on the larger part and reports regression metrics on the hold-out
    part as a plain dict.

    The wrapped estimator is exposed through ``model`` and the keyword
    arguments it was built with through ``params``.

    NOTE(review): the class relies on ``CatBoostRegressor``,
    ``train_test_split``, ``GridSearchCV``, ``metrics`` and ``np`` being
    available in module scope; no imports are visible in this part of the
    file, so confirm they are provided.
    """

    def __init__(self, param=None):
        """
        Create the underlying ``CatBoostRegressor``.

        param : mapping of keyword arguments forwarded to
                ``CatBoostRegressor``, e.g.
                ``{"loss_function": "RMSE", "iterations": 100}``.
                ``None`` or an empty container means "library defaults".
        """
        # Copy into a fresh dict so instances never share a mutable default
        # and so an empty list (the historical default) is still accepted.
        kwargs = dict(param) if param else {}
        self._rfr = CatBoostRegressor(**kwargs)
        self._param = kwargs

    @classmethod
    def new_instance(cls, param=None):
        """
        Alternate constructor, e.g.
        ``best = catboost_regressor.new_instance(model_cv.best_params_)``.

        param : optional mapping handed unchanged to ``__init__``.
        """
        return cls(param)

    @property
    def model(self):
        """
        Return the wrapped ``CatBoostRegressor`` instance.

        It is a property, so use ``wrapper.model`` without parentheses.
        """
        return self._rfr

    @property
    def params(self):
        """
        Return the keyword arguments the wrapped model was created with.
        """
        return self._param

    @staticmethod
    def _split_frame(df, var_dict):
        """Pick feature/label columns from ``df`` and return the 75/25 split."""
        X = df[var_dict["independant"]]
        y = df[var_dict["dependant"]]
        # Fixed seed keeps the train/test partition identical between runs.
        return train_test_split(X, y, test_size=0.25, random_state=42)

    @staticmethod
    def _error_metrics(y_test, y_pred):
        """Return a dict with MAE, MSE, RMSE and R2 of the predictions."""
        mse = metrics.mean_squared_error(y_test, y_pred)
        return {
            "mae": metrics.mean_absolute_error(y_test, y_pred),
            "mse": mse,
            "rmse": np.sqrt(mse),
            "r2": metrics.r2_score(y_test, y_pred),
        }

    def model_run(self, df, var_dict, cat_features=None, other_dict=None):
        """
        Fit the wrapped model and evaluate it on the hold-out split.

        df : dataframe holding both feature and label columns
        var_dict : model variables dict - var_dict["independant"] (feature
                   columns) and var_dict["dependant"] (label column)
        cat_features : optional collection of categorical column names or
                       indices; when non-empty it is forwarded to
                       ``CatBoostRegressor.fit``. It used to be accepted
                       but ignored.
        other_dict : unused, kept for signature compatibility

        Returns a dict with keys "mae", "mse", "rmse", "r2", "model",
        "y_test", "y_pred" and "model_score" (the estimator's own ``score``
        on the hold-out data).
        """
        X_train, X_test, y_train, y_test = self._split_frame(df, var_dict)

        fit_kwargs = {}
        # len() instead of truthiness so numpy arrays are handled as well.
        if cat_features is not None and len(cat_features) > 0:
            fit_kwargs["cat_features"] = list(cat_features)

        self._rfr.fit(X_train, y_train, **fit_kwargs)
        y_pred = self._rfr.predict(X_test)

        return_dict = self._error_metrics(y_test, y_pred)
        return_dict["model"] = self.model
        return_dict["y_test"] = y_test
        return_dict["y_pred"] = y_pred
        return_dict["model_score"] = self._rfr.score(X_test, y_test)

        ## TODO when model has no param: also expose self.params here
        return return_dict

    def model_run_cv(self, df, var_dict, other_dict=None):
        """
        Grid-search the wrapped model and evaluate the refit search object
        on the hold-out split.

        df : dataframe holding both feature and label columns
        var_dict : model variables dict - var_dict["independant"],
                   var_dict["dependant"]
        other_dict : search settings -
                     other_dict["parameters"] : parameter grid (required,
                                                KeyError when missing)
                     other_dict["scoring"] : optional scorer name, e.g.
                                             "neg_mean_absolute_error"
                                             (negated because the search
                                             maximises its score)
                     other_dict["cv"] : optional cross-validation setting

        Returns a dict with keys "mae", "mse", "rmse", "r2", "model_score",
        "model" (the fitted ``GridSearchCV``), "y_test" and "y_pred".
        "model_score" is ``GridSearchCV.score`` on the hold-out data.
        """
        settings = other_dict if other_dict is not None else {}
        X_train, X_test, y_train, y_test = self._split_frame(df, var_dict)

        # Missing "scoring"/"cv" fall back to None, GridSearchCV's defaults.
        grid_search_ad = GridSearchCV(
            estimator=self._rfr,
            param_grid=settings["parameters"],
            scoring=settings.get("scoring"),
            cv=settings.get("cv"),
            n_jobs=-1,
            verbose=2,
        )
        grid_search_ad.fit(X_train, y_train)
        y_pred = grid_search_ad.predict(X_test)

        return_dict = self._error_metrics(y_test, y_pred)
        return_dict["model_score"] = grid_search_ad.score(X_test, y_test)
        return_dict["model"] = grid_search_ad
        return_dict["y_test"] = y_test
        return_dict["y_pred"] = y_pred

        ## TODO when model has no param: also expose self.params here
        return return_dict

    #----------------------------------------- MLFLOW ----------------------------------------------------------#
    def model_run_mlfow(self, df, var_dict, other_dict=None):
        """
        MLflow-named entry point for a single fit.

        No MLflow call is made; it returns exactly what ``model_run``
        returns. The former ``oob_score_`` lookup was dropped because it is
        a random-forest attribute that the CatBoost model does not provide.

        df : dataframe
        var_dict : model variables dict - var_dict["independant"],
                   var_dict["dependant"]
        other_dict : unused, kept for signature compatibility
        """
        return self.model_run(df, var_dict, other_dict=other_dict)

    def model_run_cv_mlfow(self, df, var_dict, other_dict=None):
        """
        MLflow-named entry point for the grid search.

        No MLflow call is made; it returns exactly what ``model_run_cv``
        returns.

        df : dataframe
        var_dict : model variables dict - var_dict["independant"],
                   var_dict["dependant"]
        other_dict : search settings - other_dict["parameters"],
                     other_dict["scoring"], other_dict["cv"]
        """
        return self.model_run_cv(df, var_dict, other_dict=other_dict)
	class catboost_regressor():
	    """
	    Thin wrapper around a ``CatBoostRegressor``.

	    Every ``model_run*`` method splits the dataframe 75/25 with a fixed
	    seed, trains on the larger part and reports regression metrics on the
	    hold-out part as a plain dict.

	    The wrapped estimator is exposed through ``model`` and the keyword
	    arguments it was built with through ``params``.

	    NOTE(review): the class relies on ``CatBoostRegressor``,
	    ``train_test_split``, ``GridSearchCV``, ``metrics`` and ``np`` being
	    available in module scope; no imports are visible in this part of the
	    file, so confirm they are provided.
	    """

	    def __init__(self, param=None):
	        """
	        Create the underlying ``CatBoostRegressor``.

	        param : mapping of keyword arguments forwarded to
	                ``CatBoostRegressor``, e.g.
	                ``{"loss_function": "RMSE", "iterations": 100}``.
	                ``None`` or an empty container means "library defaults".
	        """
	        # Copy into a fresh dict so instances never share a mutable default
	        # and so an empty list (the historical default) is still accepted.
	        kwargs = dict(param) if param else {}
	        self._rfr = CatBoostRegressor(**kwargs)
	        self._param = kwargs

	    @classmethod
	    def new_instance(cls, param=None):
	        """
	        Alternate constructor, e.g.
	        ``best = catboost_regressor.new_instance(model_cv.best_params_)``.

	        param : optional mapping handed unchanged to ``__init__``.
	        """
	        return cls(param)

	    @property
	    def model(self):
	        """
	        Return the wrapped ``CatBoostRegressor`` instance.

	        It is a property, so use ``wrapper.model`` without parentheses.
	        """
	        return self._rfr

	    @property
	    def params(self):
	        """
	        Return the keyword arguments the wrapped model was created with.
	        """
	        return self._param

	    @staticmethod
	    def _split_frame(df, var_dict):
	        """Pick feature/label columns from ``df`` and return the 75/25 split."""
	        X = df[var_dict["independant"]]
	        y = df[var_dict["dependant"]]
	        # Fixed seed keeps the train/test partition identical between runs.
	        return train_test_split(X, y, test_size=0.25, random_state=42)

	    @staticmethod
	    def _error_metrics(y_test, y_pred):
	        """Return a dict with MAE, MSE, RMSE and R2 of the predictions."""
	        mse = metrics.mean_squared_error(y_test, y_pred)
	        return {
	            "mae": metrics.mean_absolute_error(y_test, y_pred),
	            "mse": mse,
	            "rmse": np.sqrt(mse),
	            "r2": metrics.r2_score(y_test, y_pred),
	        }

	    def model_run(self, df, var_dict, cat_features=None, other_dict=None):
	        """
	        Fit the wrapped model and evaluate it on the hold-out split.

	        df : dataframe holding both feature and label columns
	        var_dict : model variables dict - var_dict["independant"] (feature
	                   columns) and var_dict["dependant"] (label column)
	        cat_features : optional collection of categorical column names or
	                       indices; when non-empty it is forwarded to
	                       ``CatBoostRegressor.fit``. It used to be accepted
	                       but ignored.
	        other_dict : unused, kept for signature compatibility

	        Returns a dict with keys "mae", "mse", "rmse", "r2", "model",
	        "y_test", "y_pred" and "model_score" (the estimator's own
	        ``score`` on the hold-out data).
	        """
	        X_train, X_test, y_train, y_test = self._split_frame(df, var_dict)

	        fit_kwargs = {}
	        # len() instead of truthiness so numpy arrays are handled as well.
	        if cat_features is not None and len(cat_features) > 0:
	            fit_kwargs["cat_features"] = list(cat_features)

	        self._rfr.fit(X_train, y_train, **fit_kwargs)
	        y_pred = self._rfr.predict(X_test)

	        return_dict = self._error_metrics(y_test, y_pred)
	        return_dict["model"] = self.model
	        return_dict["y_test"] = y_test
	        return_dict["y_pred"] = y_pred
	        return_dict["model_score"] = self._rfr.score(X_test, y_test)

	        ## TODO when model has no param: also expose self.params here
	        return return_dict

	    def model_run_cv(self, df, var_dict, other_dict=None):
	        """
	        Grid-search the wrapped model and evaluate the refit search object
	        on the hold-out split.

	        df : dataframe holding both feature and label columns
	        var_dict : model variables dict - var_dict["independant"],
	                   var_dict["dependant"]
	        other_dict : search settings -
	                     other_dict["parameters"] : parameter grid (required,
	                                                KeyError when missing)
	                     other_dict["scoring"] : optional scorer name, e.g.
	                                             "neg_mean_absolute_error"
	                                             (negated because the search
	                                             maximises its score)
	                     other_dict["cv"] : optional cross-validation setting

	        Returns a dict with keys "mae", "mse", "rmse", "r2",
	        "model_score", "model" (the fitted ``GridSearchCV``), "y_test" and
	        "y_pred". "model_score" is ``GridSearchCV.score`` on the hold-out
	        data.
	        """
	        settings = other_dict if other_dict is not None else {}
	        X_train, X_test, y_train, y_test = self._split_frame(df, var_dict)

	        # Missing "scoring"/"cv" fall back to None, GridSearchCV's defaults.
	        grid_search_ad = GridSearchCV(
	            estimator=self._rfr,
	            param_grid=settings["parameters"],
	            scoring=settings.get("scoring"),
	            cv=settings.get("cv"),
	            n_jobs=-1,
	            verbose=2,
	        )
	        grid_search_ad.fit(X_train, y_train)
	        y_pred = grid_search_ad.predict(X_test)

	        return_dict = self._error_metrics(y_test, y_pred)
	        return_dict["model_score"] = grid_search_ad.score(X_test, y_test)
	        return_dict["model"] = grid_search_ad
	        return_dict["y_test"] = y_test
	        return_dict["y_pred"] = y_pred

	        ## TODO when model has no param: also expose self.params here
	        return return_dict

	    #----------------------------------------- MLFLOW ----------------------------------------------------------#
	    def model_run_mlfow(self, df, var_dict, other_dict=None):
	        """
	        MLflow-named entry point for a single fit.

	        No MLflow call is made; it returns exactly what ``model_run``
	        returns. The former ``oob_score_`` lookup was dropped because it
	        is a random-forest attribute that the CatBoost model does not
	        provide.

	        df : dataframe
	        var_dict : model variables dict - var_dict["independant"],
	                   var_dict["dependant"]
	        other_dict : unused, kept for signature compatibility
	        """
	        return self.model_run(df, var_dict, other_dict=other_dict)

	    def model_run_cv_mlfow(self, df, var_dict, other_dict=None):
	        """
	        MLflow-named entry point for the grid search.

	        No MLflow call is made; it returns exactly what ``model_run_cv``
	        returns.

	        df : dataframe
	        var_dict : model variables dict - var_dict["independant"],
	                   var_dict["dependant"]
	        other_dict : search settings - other_dict["parameters"],
	                     other_dict["scoring"], other_dict["cv"]
	        """
	        return self.model_run_cv(df, var_dict, other_dict=other_dict)