This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Instantiate the XGBoost regressor with explicit, fixed hyperparameters.
# NOTE(review): these values presumably come from a hyperparameter search —
# confirm their origin before changing them.
xgb_reg = xgb.XGBRegressor(
    max_depth=3,
    min_child_weight=9.517754381682627,
    gamma=9.525156702124235,
    reg_alpha=0.07435977523021892,
    reg_lambda=1.4770738953975366,
)
# combine both preprocessing and modeling | |
xgb_pipe = Pipeline([ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define the objective function and the hyperparameters we want to tune | |
def objective(space): | |
model=xgb.XGBRegressor( | |
n_estimators =space['n_estimators'], | |
max_depth = int(space['max_depth']), | |
gamma = space['gamma'], | |
reg_alpha = int(space['reg_alpha']), | |
min_child_weight=int(space['min_child_weight']) | |
) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe | |
# Produce the preprocessed test features: hyperparameter tuning works on
# already-transformed data rather than on the full pipeline.
# Use transform() instead of fit_transform(): refitting the preprocessing on
# the test split would let test-set information leak into the features and
# could make them inconsistent with the features the model was trained on.
# NOTE(review): assumes comprehensive_preprocessing has already been fitted on
# the training data (e.g. through xgb_pipe.fit) — confirm the execution order.
X_test_post = comprehensive_preprocessing.transform(X_test)
#set hyperparameter space | |
space={'max_depth': hp.quniform("max_depth", 3, 18, 1), | |
'gamma': hp.uniform ('gamma', 0, 18), | |
'reg_alpha' : hp.uniform('reg_alpha', 0,1), | |
'reg_lambda' : hp.uniform('reg_lambda', 0,2), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.base import BaseEstimator, TransformerMixin | |
class cat_onehot_transformer_custom(BaseEstimator, TransformerMixin): | |
def __init__(self, cat_list): | |
self.cat_list = cat_list | |
def fit(self, X, y = None): | |
return self | |
def transform(self, X): | |
X_post = X.copy() | |
for cat in self.cat_list: | |
X_post[cat] = X_post[cat].str.replace(' ', '_') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build an XGBoost regressor using the library's default settings.
xgb_reg = xgb.XGBRegressor()

# Chain preprocessing and the regressor so they run as a single estimator.
xgb_pipe = Pipeline(
    steps=[
        ("preprocess", comprehensive_preprocessing),
        ("xgboost", xgb_reg),
    ]
)

# Fit every step of the pipeline on the training split.
xgb_pipe.fit(X_train, y_train)
#add model to truera |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class comprehensive_features_custom(BaseEstimator, TransformerMixin): | |
def __init__(self, comprehensive_features): | |
self.comprehensive_features = comprehensive_features | |
def fit(self, X, y = None): | |
return self | |
def transform(self, X): | |
X_post = X.reindex(labels=comprehensive_features, axis=1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.pipeline import Pipeline | |
# Create one instance of each custom transformer class.
# NOTE(review): the *_list arguments are presumably lists of column names the
# respective transformer operates on — verify against their definitions.

# Transformers configured with a list argument.
cat_onehot_transformer = cat_onehot_transformer_custom(cat_list)
url_onehot_transformer = url_onehot_transformer_custom(url_list)
convert_dates_transformer = convert_dates_transformer_custom(date_list)
to_float_transformer = to_float_transformer_custom(tofloat_list)
fillna_transformer = fillna_transformer_custom(fillna_list)

# Transformer that takes no constructor arguments.
cap_reviews_per_month = cap_reviews_per_month_custom()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop unlabeled rows, i.e. rows whose 'price' is missing.
# Pass `subset` as a list: the list form is accepted by every pandas version,
# whereas a bare string label is only accepted by newer releases.
abnb_pre = abnb_df.dropna(subset=['price'])

# Delete columns in which 75% or more of the values are NaN.
perc = 75.0
# dropna(thresh=k) keeps a column only when it has at least k non-NaN values,
# so require strictly more than (100 - perc)% of the rows to be populated.
min_count = int(((100 - perc) / 100) * abnb_pre.shape[0] + 1)
abnb_pre = abnb_pre.dropna(axis=1, thresh=min_count)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import pandas as pd | |
def get_airbnb_data(city): | |
#input city with + instead of spaces, e.g. "San+Francisco" | |
#note: open data soft caps requests at 10,000 | |
url = "https://public.opendatasoft.com/api/records/1.0/search/?dataset=airbnb-listings&q=&rows=10000&refine.city=" + city | |
resp = requests.get(url) | |
print(resp.status_code) | |
data = resp.json() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import truera | |
#set up TruEra workspace | |
from truera.client.truera_workspace import TrueraWorkspace | |
from truera.client.truera_authentication import BasicAuthentication | |
# Build the credentials object from the username/password constants.
auth = BasicAuthentication(USERNAME, PASSWORD)

# Open the workspace against the configured connection string.
# NOTE(review): verify_cert=False presumably disables TLS certificate
# verification — confirm this is intended outside of local/test deployments.
tru = TrueraWorkspace(CONNECTION_STRING, auth, verify_cert=False)

# Select the remote environment and the regression project to work in.
tru.set_environment("remote")
tru.set_project("airbnb_sf_price", score_type="regression")
#create data collection or schema |
OlderNewer