# Imports
from databricks import feature_store
import random

# Database and table used for feature tables
database_name = "My_DB_Name"
table_name = "titanic_dataset_feature"

# Initiate the Feature Store client
fs = feature_store.FeatureStoreClient()
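# --- Hedged sketch (not from the original gist): one plausible way to register the
# --- engineered Titanic features as a feature table with this client. The DataFrame
# --- name `titanic_features_df` and the primary key `PassengerId` are assumptions.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")
fs.create_table(
    name=f"{database_name}.{table_name}",
    primary_keys=["PassengerId"],
    df=titanic_features_df,
    description="Engineered features for the Titanic dataset",
)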
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.functions import vector_to_array
from pyspark.sql import functions as f  # needed for f.log10 below

def create_features(dataframe):
    # Take the log of the fare
    dataframe = dataframe.withColumn("logFare", f.log10(dataframe.Fare))
    # Family head count (siblings/spouses + parents/children + self)
    dataframe = dataframe.withColumn("Family", dataframe.SibSp + dataframe.Parch + 1)
    # Average ticket price
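    # --- Hedged completion (the gist is truncated here): one plausible reading of
    # --- "average ticket price" is the fare split across the family members
    # --- travelling together; the column name `avgFare` is an assumption.
    dataframe = dataframe.withColumn("avgFare", dataframe.Fare / f.col("Family"))
    return dataframe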
from mlflow.tracking import MlflowClient  # needed for MlflowClient below

def get_latest_model_version(model_name):
    latest_version = 1
    mlflow_client = MlflowClient()
    for mv in mlflow_client.search_model_versions(f"name='{model_name}'"):
        version_int = int(mv.version)
        if version_int > latest_version:
            latest_version = version_int
    return latest_version

# Just pass the raw test data, which doesn't have the feature columns
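# --- Hedged sketch (not from the original gist): batch scoring through the Feature
# --- Store client, which looks up the feature columns automatically so only the raw
# --- test data needs to be passed. The model name `titanic_model` and the DataFrame
# --- `raw_test_df` are assumptions.
model_name = "titanic_model"
model_uri = f"models:/{model_name}/{get_latest_model_version(model_name)}"
predictions = fs.score_batch(model_uri, raw_test_df)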
import csv

# Create a file and open a connection
OUT_FILE = 'bayes_test.csv'
of_connection = open(OUT_FILE, 'w')
writer = csv.writer(of_connection)

ITERATION = 0

# Write column names
headers = ['loss', 'std_loss', 'hyperparameters', 'iteration', 'runtime', 'score']
writer.writerow(headers)
of_connection.close()
from hyperopt import hp  # needed for the hp.* distributions below

# Create a new search space comparable to the grid search and random search, and run it for 60 iterations
search_space = {
    'boosting_type': hp.choice('boosting_type',
                               [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 0.51)}]),
    'subsample_for_bin': hp.uniform('subsample_for_bin', 20000, 20001),
    'min_child_samples': hp.uniform('min_child_samples', 20, 21),
    'num_leaves': hp.uniform('num_leaves', 2, 20),
    'n_estimators': hp.quniform('n_estimators', 1000, 10000, 1000)
}
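# --- Hedged sketch (not from the original gist): running the Tree-structured Parzen
# --- Estimator over this search space for the 60 iterations mentioned above.
# --- `objective` is the function defined in the next snippet, so define it first.
from hyperopt import fmin, tpe, Trials

trials = Trials()
best = fmin(fn=objective, space=search_space, algo=tpe.suggest,
            max_evals=60, trials=trials)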
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(hyperparameters):
    # Keep track of evals
    global ITERATION
    ITERATION += 1
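    # --- Hedged completion (the gist is truncated here): a plausible body that fills the
    # --- CSV columns defined earlier. It assumes `import lightgbm as lgb`,
    # --- `from sklearn.model_selection import cross_val_score`, and `X_train`/`y_train`
    # --- are available from the other snippets.
    # hp.choice wraps boosting_type in a nested dict; flatten it
    subsample = hyperparameters['boosting_type'].get('subsample', 1.0)
    hyperparameters['boosting_type'] = hyperparameters['boosting_type']['boosting_type']
    hyperparameters['subsample'] = subsample
    # Cast the parameters that LightGBM expects as integers
    for key in ['num_leaves', 'subsample_for_bin', 'min_child_samples', 'n_estimators']:
        hyperparameters[key] = int(hyperparameters[key])

    # Time a 3-fold cross-validation of this candidate
    start = timer()
    scores = cross_val_score(lgb.LGBMRegressor(**hyperparameters), X_train, y_train,
                             scoring='neg_root_mean_squared_error', cv=3)
    run_time = timer() - start

    loss = -scores.mean()      # RMSE, which Hyperopt minimizes
    std_loss = scores.std()

    # Append this evaluation to the CSV opened in the earlier snippet
    with open(OUT_FILE, 'a') as of_connection:
        csv.writer(of_connection).writerow(
            [loss, std_loss, hyperparameters, ITERATION, run_time, scores.mean()])

    return {'loss': loss, 'status': STATUS_OK}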
# Libraries
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold

# Use RepeatedKFold from the sklearn library
model = lgb.LGBMRegressor()
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=0)

# Train the model
# Note: RandomizedSearchCV maximizes the scoring, hence neg_root_mean_squared_error is used
# Use 60 iterations for the random search
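# --- Hedged sketch (not from the original gist): the fit call that the comments above
# --- describe, mirroring the GridSearchCV call further down. `param_grid` stands in
# --- for the parameter grid defined elsewhere in the notebook.
random_results = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=60,
                                    scoring='neg_root_mean_squared_error',
                                    cv=cv, random_state=0).fit(X_train, y_train)
print(random_results.best_params_, random_results.best_score_)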
import random
import pandas as pd  # needed for the results DataFrame

# Create a function for random search
def random_search(param_grid, max_evals=MAX_EVALS):
    # Dataframe for results
    results = pd.DataFrame(columns=['score', 'params', 'iteration'],
                           index=list(range(MAX_EVALS)))
    # Keep searching until we reach max evaluations
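    # --- Hedged completion (the gist is truncated here): sample one value per
    # --- hyperparameter at random, evaluate, and record the result. The helper
    # --- `evaluate_hyperparameters` is a hypothetical stand-in for whatever
    # --- cross-validation routine the notebook uses to score a candidate.
    scores, params_list = [], []
    for i in range(max_evals):
        hyperparameters = {k: random.choice(v) for k, v in param_grid.items()}
        scores.append(evaluate_hyperparameters(hyperparameters))  # hypothetical helper
        params_list.append(hyperparameters)
    results['score'] = scores
    results['params'] = params_list
    results['iteration'] = list(range(max_evals))
    # Sort with the best (highest) score first and return
    results = results.sort_values('score', ascending=False).reset_index(drop=True)
    return results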
# Use GridSearchCV from the sklearn library
model = lgb.LGBMRegressor()
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=0)

# Train the model
# Note: GridSearchCV maximizes the scoring, hence neg_root_mean_squared_error is used
cv_results = GridSearchCV(model, search_space, scoring='neg_root_mean_squared_error', cv=cv).fit(X_train, y_train)
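# --- Hedged follow-up (not from the original gist): inspect the best combination found.
# --- best_score_ is the negative RMSE, so flip the sign to report RMSE.
print(cv_results.best_params_)
print('Best RMSE:', -cv_results.best_score_)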
# Imports
from sklearn.model_selection import GridSearchCV, RepeatedKFold

# Create a small version of the parameter grid defined in the Domain section and name it search_space:
# take the best parameters, with a little variation in the number of leaves and the number of estimators
search_space = {
    'boosting_type': ['gbdt'],
    'num_leaves': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    'learning_rate': [0.005],
    'subsample_for_bin': [20000],
    'min_child_samples': [15],
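    # --- Hedged completion (the gist is truncated here): the n_estimators values are an
    # --- assumption, spaced like the hp.quniform('n_estimators', 1000, 10000, 1000) range above.
    'n_estimators': [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000],
}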