Josh Reini joshreini1

## truera_add_tuned_xgb_airbnb.py
# Instantiate the XGBoost regressor with a fixed set of hyperparameters
# (presumably the values selected by the hyperparameter search -- confirm).
_tuned_params = {
    "gamma": 9.525156702124235,
    "max_depth": 3,
    "min_child_weight": 9.517754381682627,
    "reg_alpha": 0.07435977523021892,
    "reg_lambda": 1.4770738953975366,
}
xgb_reg = xgb.XGBRegressor(**_tuned_params)

# combine both preprocessing and modeling
xgb_pipe = Pipeline([

## tune_hyperparams_airbnb.py
#define the objective function and the hyperparameters we want to tune
def objective(space):
   """Build the XGBoost regressor for one hyperopt trial.

   Parameters
   ----------
   space : dict
       Sampled hyperparameters for this trial. The keys read here are
       'n_estimators', 'max_depth', 'gamma', 'reg_alpha', 'reg_lambda'
       and 'min_child_weight'.
   """
   model=xgb.XGBRegressor(
                   n_estimators =space['n_estimators'],
                   # hp.quniform yields floats; the tree depth is cast to int.
                   max_depth = int(space['max_depth']),
                   gamma = space['gamma'],
                   # reg_alpha is drawn from hp.uniform(0, 1); casting it to int
                   # would truncate every sample to 0, so the float is passed as is.
                   reg_alpha = space['reg_alpha'],
                   # reg_lambda is part of the search space and must reach the
                   # model, otherwise its sampled value has no effect on the trial.
                   reg_lambda = space['reg_lambda'],
                   # NOTE(review): int() is kept here; confirm the search space
                   # samples min_child_weight on an integer grid.
                   min_child_weight=int(space['min_child_weight'])
                   )


## hyperparameter_space.py
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

#use post data instead of pipe for hyperparameter tuning, get x_test_post
# NOTE(review): fit_transform re-fits the preprocessing steps on the test split.
# If comprehensive_preprocessing has already been fit on the training data,
# transform(X_test) is presumably what is intended here -- confirm.
X_test_post = comprehensive_preprocessing.fit_transform(X_test, y_test)

#set hyperparameter space
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
       'gamma': hp.uniform ('gamma', 0, 18),
       'reg_alpha' : hp.uniform('reg_alpha', 0,1),
       'reg_lambda' : hp.uniform('reg_lambda', 0,2),

## multihot_transformer.py
from sklearn.base import BaseEstimator, TransformerMixin
class cat_onehot_transformer_custom(BaseEstimator, TransformerMixin):
   """Scikit-learn style transformer acting on the columns named in ``cat_list``.

   The visible part of ``transform`` replaces spaces with underscores in the
   string values of each listed column.
   """
   def __init__(self, cat_list):
       # Names of the columns that transform() rewrites.
       self.cat_list = cat_list
   def fit(self, X, y = None):
       # Nothing is learned from the data; fit only returns the instance.
       return self
   def transform(self, X):
       # Work on a copy rather than on the object passed in.
       X_post = X.copy()
       for cat in self.cat_list:
           # Uses the .str accessor, so each listed column is assumed to hold strings.
           X_post[cat] = X_post[cat].str.replace(' ', '_')
       # NOTE(review): no further processing or return statement is visible in
       # this excerpt; as shown, transform() returns None.

## airbnb_xgb_reg.py
# Regressor created with the library's default hyperparameters.
xgb_reg = xgb.XGBRegressor()

# Chain preprocessing and the regressor so both are fit and applied together.
_pipeline_steps = [
    ('preprocess', comprehensive_preprocessing),
    ('xgboost', xgb_reg),
]
xgb_pipe = Pipeline(steps=_pipeline_steps)

# Fit the whole pipeline on the training split.
xgb_pipe.fit(X_train, y_train)
#add model to truera

## prevent_column_leakage_transformer.py
class comprehensive_features_custom(BaseEstimator, TransformerMixin):
   """Conform a DataFrame to a fixed list of feature columns.

   ``transform`` reindexes the columns of ``X`` to ``comprehensive_features``:
   columns not in the list are dropped, and listed columns missing from ``X``
   are added (filled with NaN by ``DataFrame.reindex``).
   """

   def __init__(self, comprehensive_features):
       # Column labels every transformed frame must have.
       self.comprehensive_features = comprehensive_features

   def fit(self, X, y = None):
       # Stateless: nothing is learned from the data.
       return self

   def transform(self, X):
       # Use the list stored on the instance, not a module-level global of the
       # same name, so the constructor argument actually takes effect.
       X_post = X.reindex(labels=self.comprehensive_features, axis=1)
       # Return the result so the transformer can be used as a Pipeline step.
       return X_post

## airbnb_pipelines.py
from sklearn.pipeline import Pipeline

#instantiate transformer classes
# Each custom transformer class is instantiated once. The classes and the
# *_list arguments (cat_list, url_list, date_list, tofloat_list, fillna_list)
# are defined elsewhere.
cat_onehot_transformer = cat_onehot_transformer_custom(cat_list)
url_onehot_transformer = url_onehot_transformer_custom(url_list)

convert_dates_transformer = convert_dates_transformer_custom(date_list)
to_float_transformer = to_float_transformer_custom(tofloat_list)
fillna_transformer = fillna_transformer_custom(fillna_list)
# This transformer takes no constructor arguments.
cap_reviews_per_month = cap_reviews_per_month_custom()

## drop_na_airbnb.py
#drop unlabeled data
# `subset` is given as a list: older pandas releases only accept a list-like
# here, while a list works on every version.
abnb_pre = abnb_df.dropna(subset=['price'])
# Delete columns in which 75% or more of the values are NaN
perc = 75.0
# `thresh` is the minimum number of non-NaN values a column needs to be kept:
# strictly more than (100 - perc)% of the remaining rows.
min_count = int(((100 - perc) / 100) * abnb_pre.shape[0] + 1)
abnb_pre = abnb_pre.dropna(axis=1, thresh=min_count)

## get_airbnb_data.py
import requests
import pandas as pd

def get_airbnb_data(city):
   """Request Airbnb listings for ``city`` from the OpenDataSoft records API.

   ``city`` is appended verbatim to the query string as the ``refine.city``
   value, so spaces must already be written as ``+``.
   """
   #input city with + instead of spaces, e.g. "San+Francisco"
   #note: open data soft caps requests at 10,000
   url = "https://public.opendatasoft.com/api/records/1.0/search/?dataset=airbnb-listings&q=&rows=10000&refine.city=" + city
   # NOTE(review): no timeout is passed, so this call can block indefinitely.
   resp = requests.get(url)
   # The HTTP status is only printed; a non-200 response is not treated as an error here.
   print(resp.status_code)
   # Parse the response body as JSON.
   data = resp.json()
   # NOTE(review): no return statement is visible in this excerpt.

## setup_truera_airbnb.py
import truera
#set up TruEra workspace
from truera.client.truera_workspace import TrueraWorkspace
from truera.client.truera_authentication import BasicAuthentication
# USERNAME, PASSWORD and CONNECTION_STRING must be defined before this point.
auth = BasicAuthentication(USERNAME, PASSWORD)
# NOTE(review): verify_cert = False presumably disables TLS certificate
# verification -- confirm this is acceptable outside a demo environment.
tru = TrueraWorkspace(CONNECTION_STRING, auth, verify_cert = False)
tru.set_environment("remote")
# Select the project used for this regression model.
tru.set_project("airbnb_sf_price", score_type="regression")

#create data collection or schema
	# Instantiate the XGBoost regressor with a fixed set of hyperparameters
	# (presumably the values selected by the hyperparameter search -- confirm).
	_tuned_params = {
		"gamma": 9.525156702124235,
		"max_depth": 3,
		"min_child_weight": 9.517754381682627,
		"reg_alpha": 0.07435977523021892,
		"reg_lambda": 1.4770738953975366,
	}
	xgb_reg = xgb.XGBRegressor(**_tuned_params)

	# combine both preprocessing and modeling
	xgb_pipe = Pipeline([
	#define the objective function and the hyperparameters we want to tune
	def objective(space):
		"""Build the XGBoost regressor for one hyperopt trial.

		``space`` is the dict of sampled hyperparameters. The keys read here are
		'n_estimators', 'max_depth', 'gamma', 'reg_alpha', 'reg_lambda' and
		'min_child_weight'.
		"""
		model=xgb.XGBRegressor(
			n_estimators =space['n_estimators'],
			# hp.quniform yields floats; the tree depth is cast to int.
			max_depth = int(space['max_depth']),
			gamma = space['gamma'],
			# reg_alpha is drawn from hp.uniform(0, 1); casting it to int would
			# truncate every sample to 0, so the float is passed as is.
			reg_alpha = space['reg_alpha'],
			# reg_lambda is part of the search space and must reach the model,
			# otherwise its sampled value has no effect on the trial.
			reg_lambda = space['reg_lambda'],
			# NOTE(review): int() is kept here; confirm the search space samples
			# min_child_weight on an integer grid.
			min_child_weight=int(space['min_child_weight'])
		)
	from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

	#use post data instead of pipe for hyperparameter tuning, get x_test_post
	# NOTE(review): fit_transform re-fits the preprocessing steps on the test split.
	# If comprehensive_preprocessing has already been fit on the training data,
	# transform(X_test) is presumably what is intended here -- confirm.
	X_test_post = comprehensive_preprocessing.fit_transform(X_test, y_test)

	#set hyperparameter space
	space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
	'gamma': hp.uniform ('gamma', 0, 18),
	'reg_alpha' : hp.uniform('reg_alpha', 0,1),
	'reg_lambda' : hp.uniform('reg_lambda', 0,2),
	from sklearn.base import BaseEstimator, TransformerMixin
	# Scikit-learn style transformer acting on the columns named in cat_list.
	# NOTE(review): the indentation of this copy has been flattened, so the
	# nesting of the class and its methods is not expressed by whitespace.
	class cat_onehot_transformer_custom(BaseEstimator, TransformerMixin):
	def __init__(self, cat_list):
	# Names of the columns that transform() rewrites.
	self.cat_list = cat_list
	def fit(self, X, y = None):
	# Nothing is learned from the data; fit only returns the instance.
	return self
	def transform(self, X):
	# Work on a copy rather than on the object passed in.
	X_post = X.copy()
	for cat in self.cat_list:
	# Replace spaces with underscores in the string values of each listed column.
	X_post[cat] = X_post[cat].str.replace(' ', '_')
	# Regressor created with the library's default hyperparameters.
	xgb_reg = xgb.XGBRegressor()

	# Chain preprocessing and the regressor so both are fit and applied together.
	_pipeline_steps = [
		('preprocess', comprehensive_preprocessing),
		('xgboost', xgb_reg),
	]
	xgb_pipe = Pipeline(steps=_pipeline_steps)

	# Fit the whole pipeline on the training split.
	xgb_pipe.fit(X_train, y_train)
	#add model to truera
	class comprehensive_features_custom(BaseEstimator, TransformerMixin):
		"""Conform a DataFrame to a fixed list of feature columns.

		``transform`` reindexes the columns of ``X`` to ``comprehensive_features``:
		columns not in the list are dropped, and listed columns missing from ``X``
		are added (filled with NaN by ``DataFrame.reindex``).
		"""

		def __init__(self, comprehensive_features):
			# Column labels every transformed frame must have.
			self.comprehensive_features = comprehensive_features

		def fit(self, X, y = None):
			# Stateless: nothing is learned from the data.
			return self

		def transform(self, X):
			# Use the list stored on the instance, not a module-level global of
			# the same name, so the constructor argument actually takes effect.
			X_post = X.reindex(labels=self.comprehensive_features, axis=1)
			# Return the result so the transformer can be used as a Pipeline step.
			return X_post
	from sklearn.pipeline import Pipeline

	#instantiate transformer classes
	# Each custom transformer class is instantiated once. The classes and the
	# *_list arguments (cat_list, url_list, date_list, tofloat_list, fillna_list)
	# are defined elsewhere.
	cat_onehot_transformer = cat_onehot_transformer_custom(cat_list)
	url_onehot_transformer = url_onehot_transformer_custom(url_list)

	convert_dates_transformer = convert_dates_transformer_custom(date_list)
	to_float_transformer = to_float_transformer_custom(tofloat_list)
	fillna_transformer = fillna_transformer_custom(fillna_list)
	# This transformer takes no constructor arguments.
	cap_reviews_per_month = cap_reviews_per_month_custom()
	#drop unlabeled data
	# `subset` is given as a list: older pandas releases only accept a list-like
	# here, while a list works on every version.
	abnb_pre = abnb_df.dropna(subset=['price'])
	# Delete columns in which 75% or more of the values are NaN
	perc = 75.0
	# `thresh` is the minimum number of non-NaN values a column needs to be kept:
	# strictly more than (100 - perc)% of the remaining rows.
	min_count = int(((100 - perc) / 100) * abnb_pre.shape[0] + 1)
	abnb_pre = abnb_pre.dropna(axis=1, thresh=min_count)
	import requests
	import pandas as pd

	# Request Airbnb listings for `city` from the OpenDataSoft records API.
	# `city` is appended verbatim to the query string as the refine.city value.
	def get_airbnb_data(city):
	#input city with + instead of spaces, e.g. "San+Francisco"
	#note: open data soft caps requests at 10,000
	url = "https://public.opendatasoft.com/api/records/1.0/search/?dataset=airbnb-listings&q=&rows=10000&refine.city=" + city
	# NOTE(review): no timeout is passed, so this call can block indefinitely.
	resp = requests.get(url)
	# The HTTP status is only printed; a non-200 response is not treated as an error here.
	print(resp.status_code)
	# Parse the response body as JSON.
	data = resp.json()
	import truera
	#set up TruEra workspace
	from truera.client.truera_workspace import TrueraWorkspace
	from truera.client.truera_authentication import BasicAuthentication
	# USERNAME, PASSWORD and CONNECTION_STRING must be defined before this point.
	auth = BasicAuthentication(USERNAME, PASSWORD)
	# NOTE(review): verify_cert = False presumably disables TLS certificate
	# verification -- confirm this is acceptable outside a demo environment.
	tru = TrueraWorkspace(CONNECTION_STRING, auth, verify_cert = False)
	tru.set_environment("remote")
	# Select the project used for this regression model.
	tru.set_project("airbnb_sf_price", score_type="regression")

	#create data collection or schema