Skip to content

Instantly share code, notes, and snippets.

View joshreini1's full-sized avatar

Josh Reini joshreini1

View GitHub Profile
@joshreini1
joshreini1 / get_airbnb_data.py
Created August 30, 2022 19:36
Get AirBnb Data
import requests
import pandas as pd
def get_airbnb_data(city):
    """Request Airbnb listing records for one city from Opendatasoft.

    Args:
        city: City name with ``+`` in place of spaces, e.g.
            ``"San+Francisco"``; it is appended verbatim to the query string.

    NOTE(review): this snippet is cut off after the JSON body is decoded, so
    whatever the full function does with ``data`` (and returns) is not shown.
    """
    # Opendatasoft caps a request at 10,000 records, hence rows=10000.
    url = (
        "https://public.opendatasoft.com/api/records/1.0/search/"
        "?dataset=airbnb-listings&q=&rows=10000&refine.city=" + city
    )
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(url, timeout=60)
    print(resp.status_code)
    data = resp.json()
@joshreini1
joshreini1 / drop_na_airbnb.py
Created August 30, 2022 19:39
Drop rows with no price and columns that are mostly empty
# Drop unlabeled data: rows whose price is missing.
# subset is given as a list because pandas releases before 1.5 do not accept
# a single bare label here.
abnb_pre = abnb_df.dropna(subset=['price'])
# Delete columns in which 75% or more of the values are NaN.
perc = 75.0
# dropna(thresh=...) keeps a column only when it has at least that many
# non-NaN values, so this requires strictly more than (100 - perc)% of rows.
min_count = int(((100-perc)/100)*abnb_pre.shape[0] + 1)
abnb_pre = abnb_pre.dropna(axis=1, thresh=min_count)
@joshreini1
joshreini1 / airbnb_pipelines.py
Created August 30, 2022 19:42
Set up airbnb pipelines
from sklearn.pipeline import Pipeline
# Instantiate the custom transformer classes.
# Every class except cap_reviews_per_month_custom is constructed with a list
# argument -- presumably the column names it should act on; confirm against
# each class definition. The *_list variables are not defined in this snippet.
cat_onehot_transformer = cat_onehot_transformer_custom(cat_list)
url_onehot_transformer = url_onehot_transformer_custom(url_list)
convert_dates_transformer = convert_dates_transformer_custom(date_list)
to_float_transformer = to_float_transformer_custom(tofloat_list)
fillna_transformer = fillna_transformer_custom(fillna_list)
# This transformer takes no constructor arguments.
cap_reviews_per_month = cap_reviews_per_month_custom()
@joshreini1
joshreini1 / prevent_column_leakage_transformer.py
Created August 30, 2022 19:43
Transformer to prevent column leakage from the test set
class comprehensive_features_custom(BaseEstimator, TransformerMixin):
    """Transformer that forces a frame onto a fixed set of columns.

    Args:
        comprehensive_features: Column labels the transformed frame must have.
    """

    def __init__(self, comprehensive_features):
        self.comprehensive_features = comprehensive_features

    def fit(self, X, y=None):
        """Do nothing; the column list is supplied at construction time."""
        return self

    def transform(self, X):
        """Return ``X`` reindexed to exactly the configured columns.

        ``reindex`` drops columns that are not in the list and adds the
        missing ones filled with NaN.
        """
        # Use the instance attribute: the bare name only resolved when a
        # global called ``comprehensive_features`` happened to exist.
        X_post = X.reindex(labels=self.comprehensive_features, axis=1)
        # NOTE(review): the original snippet is cut off after the reindex;
        # returning the frame is what the transformer API requires.
        return X_post
@joshreini1
joshreini1 / setup_truera_airbnb.py
Last active September 6, 2022 17:07
Set up TruEra workspace, split data and add data collection
import truera
#set up TruEra workspace
from truera.client.truera_workspace import TrueraWorkspace
from truera.client.truera_authentication import BasicAuthentication
# USERNAME, PASSWORD and CONNECTION_STRING are not defined in this snippet.
auth = BasicAuthentication(USERNAME, PASSWORD)
# NOTE(review): verify_cert = False presumably turns off TLS certificate
# verification for the connection -- confirm, and avoid it outside a trusted
# test environment.
tru = TrueraWorkspace(CONNECTION_STRING, auth, verify_cert = False)
tru.set_environment("remote")
# The project is registered with a regression score type.
tru.set_project("airbnb_sf_price", score_type="regression")
#create data collection or schema
# NOTE(review): the snippet ends here; the data-collection code is not shown.
@joshreini1
joshreini1 / airbnb_xgb_reg.py
Created August 30, 2022 19:47
instantiate xgboost regression for airbnb
# Regressor with library-default hyperparameters; it is the final step below.
xgb_reg = xgb.XGBRegressor()
# Chain the preprocessing step and the regressor into a single estimator.
xgb_pipe = Pipeline(
    steps=[
        ('preprocess', comprehensive_preprocessing),
        ('xgboost', xgb_reg),
    ]
)
# Fit preprocessing and model together on the training split.
xgb_pipe.fit(X_train, y_train)
#add model to truera
@joshreini1
joshreini1 / multihot_transformer.py
Created August 30, 2022 19:48
Multihot transformer
from sklearn.base import BaseEstimator, TransformerMixin
class cat_onehot_transformer_custom(BaseEstimator, TransformerMixin):
    """Transformer for the columns named in ``cat_list``.

    NOTE(review): only the start of ``transform`` is visible in this
    snippet; the encoding steps and the return statement are cut off.
    """

    def __init__(self, cat_list):
        self.cat_list = cat_list

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Normalise the listed columns on a copy of ``X``."""
        # Work on a copy so the caller's frame is not modified.
        X_post = X.copy()
        for cat in self.cat_list:
            # Replace spaces with underscores; regex=False makes the literal
            # match explicit regardless of the pandas default.
            X_post[cat] = X_post[cat].str.replace(' ', '_', regex=False)
@joshreini1
joshreini1 / hyperparameter_space.py
Created August 30, 2022 19:49
Set hyperparameter space
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
# Hyperparameter tuning works on preprocessed data instead of the full
# pipeline, so build the preprocessed test features here.
# Only transform: refitting the preprocessing on the test split would leak
# test-set information and overwrite what was learned from the training data.
# NOTE(review): this assumes comprehensive_preprocessing has already been
# fitted on the training data (e.g. through xgb_pipe.fit) -- confirm.
X_test_post = comprehensive_preprocessing.transform(X_test)
#set hyperparameter space
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
'gamma': hp.uniform ('gamma', 0, 18),
'reg_alpha' : hp.uniform('reg_alpha', 0,1),
'reg_lambda' : hp.uniform('reg_lambda', 0,2),
@joshreini1
joshreini1 / tune_hyperparams_airbnb.py
Created August 30, 2022 19:50
Tune hyperparameters for airbnb
#define the objective function and the hyperparameters we want to tune
def objective(space):
    """Build an XGBoost regressor from one sampled hyperparameter set.

    Args:
        space: Mapping that provides ``n_estimators``, ``max_depth``,
            ``gamma``, ``reg_alpha`` and ``min_child_weight``.

    NOTE(review): the snippet is cut off after the model is constructed; the
    training, scoring and return value of the objective are not shown.
    """
    # NOTE(review): the search space also samples reg_lambda, but it is not
    # passed to the model here -- confirm whether that is intended.
    model = xgb.XGBRegressor(
        n_estimators=space['n_estimators'],
        # hp.quniform samples come back as floats; tree depth must be an int.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        # reg_alpha is a continuous penalty: int() floored every value
        # sampled from [0, 1) to 0, which switched the L1 term off.
        reg_alpha=space['reg_alpha'],
        min_child_weight=int(space['min_child_weight']),
    )
@joshreini1
joshreini1 / truera_add_tuned_xgb_airbnb.py
Created August 30, 2022 19:51
Add tuned airbnb xgb to TruEra
# Regressor built from a fixed set of hyperparameter values.
_tuned_xgb_params = {
    'gamma': 9.525156702124235,
    'max_depth': 3,
    'min_child_weight': 9.517754381682627,
    'reg_alpha': 0.07435977523021892,
    'reg_lambda': 1.4770738953975366,
}
xgb_reg = xgb.XGBRegressor(**_tuned_xgb_params)
# combine both preprocessing and modeling
xgb_pipe = Pipeline([