# Database and table used for the feature tables
database_name = "My_DB_Name"
table_name = "titanic_dataset_feature"

# initiate the Feature Store client
from databricks import feature_store
fs = feature_store.FeatureStoreClient()

import random
import pyspark.sql.functions as f
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.functions import vector_to_array

def create_features(dataframe):
    # take the log of the fare
    dataframe = dataframe.withColumn("logFare", f.log10(dataframe.Fare))
    # family head count (siblings/spouses + parents/children + the passenger)
    dataframe = dataframe.withColumn("Family", dataframe.SibSp + dataframe.Parch + 1)
    # Average ticket price
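# --- Hedged sketch, not part of the original gist: one way the engineered DataFrame
# --- could be registered as a feature table once create_features() (truncated above)
# --- returns it. The raw `titanic_df` DataFrame and the "PassengerId" primary key
# --- are assumptions made here for illustration only.
features_df = create_features(titanic_df)
fs.create_table(
    name=f"{database_name}.{table_name}",
    primary_keys=["PassengerId"],
    df=features_df,
    description="Engineered Titanic features",
)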
# Batch_Scoring.py
from mlflow.tracking import MlflowClient

def get_latest_model_version(model_name):
    # default to version 1 if no higher registered version is found
    latest_version = 1
    mlflow_client = MlflowClient()
    for mv in mlflow_client.search_model_versions(f"name='{model_name}'"):
        version_int = int(mv.version)
        if version_int > latest_version:
            latest_version = version_int
    return latest_version
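# --- Hedged usage sketch, not from the original gist: load the latest registered
# --- version for batch scoring. The registry name "titanic_classifier" is a
# --- hypothetical placeholder.
import mlflow

model_name = "titanic_classifier"  # hypothetical registered model name
latest = get_latest_model_version(model_name)
model = mlflow.pyfunc.load_model(f"models:/{model_name}/{latest}")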
# just passing the raw test data, which doesn't have the feature columns
import csv

# Create a file and open a connection
OUT_FILE = 'bayes_test.csv'
of_connection = open(OUT_FILE, 'w')
writer = csv.writer(of_connection)

ITERATION = 0

# Write column names
headers = ['loss', 'std_loss', 'hyperparameters', 'iteration', 'runtime', 'score']
writer.writerow(headers)
# create a new search space comparable to the grid search and random search spaces and run it for 60 iterations
from hyperopt import hp

search_space = {
    'boosting_type': hp.choice('boosting_type',
                               [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 0.51)}]),
    'subsample_for_bin': hp.uniform('subsample_for_bin', 20000, 20001),
    'min_child_samples': hp.uniform('min_child_samples', 20, 21),
    'num_leaves': hp.uniform('num_leaves', 2, 20),
    'n_estimators': hp.quniform('n_estimators', 1000, 10000, 1000)
}
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(hyperparameters):
    # Keep track of evals
    global ITERATION
    ITERATION += 1
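    # --- Hedged continuation (assumption; the original gist is truncated here):
    # --- a plausible rest of objective(), assuming X_train/y_train exist and the
    # --- loss is a cross-validated RMSE, with each trial appended to bayes_test.csv.
    # unpack the nested boosting_type choice and cast integer-valued parameters
    subsample = hyperparameters['boosting_type'].get('subsample', 1.0)
    hyperparameters['boosting_type'] = hyperparameters['boosting_type']['boosting_type']
    hyperparameters['subsample'] = subsample
    for key in ['num_leaves', 'subsample_for_bin', 'min_child_samples', 'n_estimators']:
        hyperparameters[key] = int(hyperparameters[key])

    # time the cross-validated evaluation
    start = timer()
    model = lgb.LGBMRegressor(**hyperparameters)
    scores = cross_val_score(model, X_train, y_train, cv=3,
                             scoring='neg_root_mean_squared_error')
    runtime = timer() - start
    loss, std_loss = -scores.mean(), scores.std()

    # append this trial to the CSV opened above
    writer.writerow([loss, std_loss, hyperparameters, ITERATION, runtime, -loss])
    return {'loss': loss, 'status': STATUS_OK}

# imports assumed by the sketch above (would normally sit at the top of the file)
import lightgbm as lgb
from sklearn.model_selection import cross_val_score

# --- Hedged sketch (assumption): running the TPE search for 60 iterations
from hyperopt import fmin, tpe, Trials
best = fmin(fn=objective, space=search_space, algo=tpe.suggest,
            max_evals=60, trials=Trials())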
# libraries
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold

# use RepeatedKFold from the sklearn library
model = lgb.LGBMRegressor()
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=0)

# train model
# note: RandomizedSearchCV maximizes the scoring metric, hence neg_rmse is used
# Here I will use 60 iterations for the random search
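# --- Hedged sketch, not from the original gist: the fit call the comments above
# --- describe, assuming X_train/y_train are defined and that the list-valued
# --- search_space from the grid-search snippet further down is reused as the
# --- sampling distributions.
random_results = RandomizedSearchCV(model, search_space, n_iter=60,
                                    scoring='neg_root_mean_squared_error',
                                    cv=cv, random_state=0).fit(X_train, y_train)
print(random_results.best_params_, -random_results.best_score_)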
import random
import pandas as pd

# create a function for random search
def random_search(param_grid, max_evals=MAX_EVALS):
    # DataFrame for results
    results = pd.DataFrame(columns=['score', 'params', 'iteration'],
                           index=list(range(MAX_EVALS)))
    # Keep searching until we reach max evaluations
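    # --- Hedged continuation (the original gist is truncated here): a plausible
    # --- loop body, assuming a hypothetical evaluate() helper that cross-validates
    # --- one hyperparameter combination and returns (score, hyperparameters).
    for i in range(max_evals):
        # randomly sample one value for each hyperparameter in the grid
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
        score, hyperparameters = evaluate(hyperparameters)  # hypothetical helper
        results.loc[i, :] = [score, hyperparameters, i]
    # best results first
    return results.sort_values('score', ascending=False).reset_index(drop=True)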
# use GridSearchCV from the sklearn library
model = lgb.LGBMRegressor()
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=0)

# train model
# note: GridSearchCV maximizes the scoring metric, hence neg_rmse is used
cv_results = GridSearchCV(model, search_space, scoring='neg_root_mean_squared_error', cv=cv).fit(X_train, y_train)
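# --- Hedged follow-up, not from the original gist: inspect the winning combination;
# --- best_score_ is negated back to an RMSE for readability.
print("Best parameters:", cv_results.best_params_)
print("Best RMSE:", -cv_results.best_score_)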
# imports
from sklearn.model_selection import GridSearchCV, RepeatedKFold

# create a small version of the parameter grid defined in the domain and call it
# search_space - keep the best parameters, varying only the number of leaves
# and the number of estimators
search_space = {
    'boosting_type': ['gbdt'],
    'num_leaves': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    'learning_rate': [0.005],
    'subsample_for_bin': [20000],
    'min_child_samples': [15],