# Database and table used for the feature tables
database_name = "My_DB_Name"
table_name = "titanic_dataset_feature"

# initiate the Feature Store client
from databricks import feature_store
fs = feature_store.FeatureStoreClient()

import random
import pyspark.sql.functions as f
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.functions import vector_to_array

def create_features(dataframe):
    # take the log of the fare
    dataframe = dataframe.withColumn("logFare", f.log10(dataframe.Fare))
    # family head count (siblings/spouses + parents/children + the passenger)
    dataframe = dataframe.withColumn("Family", dataframe.SibSp + dataframe.Parch + 1)
    # Average ticket price
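# --- Hedged sketch, not part of the original gist: one way the engineered DataFrame
# --- could be registered as a feature table once create_features() (truncated above)
# --- returns it. The raw `titanic_df` DataFrame and the "PassengerId" primary key
# --- are assumptions made here for illustration only.
features_df = create_features(titanic_df)
fs.create_table(
    name=f"{database_name}.{table_name}",
    primary_keys=["PassengerId"],
    df=features_df,
    description="Engineered Titanic features",
)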
# Batch_Scoring.py
from mlflow.tracking import MlflowClient

def get_latest_model_version(model_name):
    # default to version 1 if no higher registered version is found
    latest_version = 1
    mlflow_client = MlflowClient()
    for mv in mlflow_client.search_model_versions(f"name='{model_name}'"):
        version_int = int(mv.version)
        if version_int > latest_version:
            latest_version = version_int
    return latest_version
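# --- Hedged usage sketch, not from the original gist: load the latest registered
# --- version for batch scoring. The registry name "titanic_classifier" is a
# --- hypothetical placeholder.
import mlflow

model_name = "titanic_classifier"  # hypothetical registered model name
latest = get_latest_model_version(model_name)
model = mlflow.pyfunc.load_model(f"models:/{model_name}/{latest}")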
# just passing the raw test data, which doesn't have the feature columns
import csv

# Create a file and open a connection
OUT_FILE = 'bayes_test.csv'
of_connection = open(OUT_FILE, 'w')
writer = csv.writer(of_connection)

ITERATION = 0

# Write column names
headers = ['loss', 'std_loss', 'hyperparameters', 'iteration', 'runtime', 'score']
writer.writerow(headers)
# create a new search space comparable to the grid search and random search spaces and run it for 60 iterations
from hyperopt import hp

search_space = {
    'boosting_type': hp.choice('boosting_type',
                               [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 0.51)}]),
    'subsample_for_bin': hp.uniform('subsample_for_bin', 20000, 20001),
    'min_child_samples': hp.uniform('min_child_samples', 20, 21),
    'num_leaves': hp.uniform('num_leaves', 2, 20),
    'n_estimators': hp.quniform('n_estimators', 1000, 10000, 1000)
}
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(hyperparameters):
    # Keep track of evals
    global ITERATION
    ITERATION += 1
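    # --- Hedged continuation (assumption; the original gist is truncated here):
    # --- a plausible rest of objective(), assuming X_train/y_train exist and the
    # --- loss is a cross-validated RMSE, with each trial appended to bayes_test.csv.
    # unpack the nested boosting_type choice and cast integer-valued parameters
    subsample = hyperparameters['boosting_type'].get('subsample', 1.0)
    hyperparameters['boosting_type'] = hyperparameters['boosting_type']['boosting_type']
    hyperparameters['subsample'] = subsample
    for key in ['num_leaves', 'subsample_for_bin', 'min_child_samples', 'n_estimators']:
        hyperparameters[key] = int(hyperparameters[key])

    # time the cross-validated evaluation
    start = timer()
    model = lgb.LGBMRegressor(**hyperparameters)
    scores = cross_val_score(model, X_train, y_train, cv=3,
                             scoring='neg_root_mean_squared_error')
    runtime = timer() - start
    loss, std_loss = -scores.mean(), scores.std()

    # append this trial to the CSV opened above
    writer.writerow([loss, std_loss, hyperparameters, ITERATION, runtime, -loss])
    return {'loss': loss, 'status': STATUS_OK}

# imports assumed by the sketch above (would normally sit at the top of the file)
import lightgbm as lgb
from sklearn.model_selection import cross_val_score

# --- Hedged sketch (assumption): running the TPE search for 60 iterations
from hyperopt import fmin, tpe, Trials
best = fmin(fn=objective, space=search_space, algo=tpe.suggest,
            max_evals=60, trials=Trials())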
# libraries
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold

# use RepeatedKFold from the sklearn library
model = lgb.LGBMRegressor()
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=0)

# train model
# note: RandomizedSearchCV maximizes the scoring metric, hence neg_rmse is used
# Here I will use 60 iterations for the random search
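# --- Hedged sketch, not from the original gist: the fit call the comments above
# --- describe, assuming X_train/y_train are defined and that the list-valued
# --- search_space from the grid-search snippet further down is reused as the
# --- sampling distributions.
random_results = RandomizedSearchCV(model, search_space, n_iter=60,
                                    scoring='neg_root_mean_squared_error',
                                    cv=cv, random_state=0).fit(X_train, y_train)
print(random_results.best_params_, -random_results.best_score_)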
import random
import pandas as pd

# create a function for random search
def random_search(param_grid, max_evals=MAX_EVALS):
    # DataFrame for results
    results = pd.DataFrame(columns=['score', 'params', 'iteration'],
                           index=list(range(MAX_EVALS)))
    # Keep searching until we reach max evaluations
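    # --- Hedged continuation (the original gist is truncated here): a plausible
    # --- loop body, assuming a hypothetical evaluate() helper that cross-validates
    # --- one hyperparameter combination and returns (score, hyperparameters).
    for i in range(max_evals):
        # randomly sample one value for each hyperparameter in the grid
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
        score, hyperparameters = evaluate(hyperparameters)  # hypothetical helper
        results.loc[i, :] = [score, hyperparameters, i]
    # best results first
    return results.sort_values('score', ascending=False).reset_index(drop=True)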
# use GridSearchCV from the sklearn library
model = lgb.LGBMRegressor()
cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=0)

# train model
# note: GridSearchCV maximizes the scoring metric, hence neg_rmse is used
cv_results = GridSearchCV(model, search_space, scoring='neg_root_mean_squared_error', cv=cv).fit(X_train, y_train)
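# --- Hedged follow-up, not from the original gist: inspect the winning combination;
# --- best_score_ is negated back to an RMSE for readability.
print("Best parameters:", cv_results.best_params_)
print("Best RMSE:", -cv_results.best_score_)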
# imports
from sklearn.model_selection import GridSearchCV, RepeatedKFold

# create a small version of the parameter grid defined in the domain and call it
# search_space - keep the best parameters, varying only the number of leaves
# and the number of estimators
search_space = {
    'boosting_type': ['gbdt'],
    'num_leaves': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    'learning_rate': [0.005],
    'subsample_for_bin': [20000],
    'min_child_samples': [15],