#### linear regression
import os
import numpy as np
import pandas as pd

os.chdir("/Users/shuozhang/Desktop/data")
nycmodel = pd.read_csv('nycmodeldata.csv', sep='\t', index_col=False, dtype={'zipcode': 'S10'})

# one-hot encode zipcode, then drop the original column and the stray index column
zip_dummies = pd.get_dummies(nycmodel['zipcode']).astype(int)
nycmodel = pd.concat([nycmodel, zip_dummies], axis=1)
nycmodel.drop(['zipcode', 'Unnamed: 0'], inplace=True, axis=1)

# take the target as a Series (avoids shape warnings in sklearn)
target = nycmodel['count']
data = nycmodel[[col for col in nycmodel.columns if col != 'count']]

# sklearn.cross_validation was removed; train_test_split now lives in model_selection
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=0)
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

ols = linear_model.LinearRegression()
ols.fit(x_train, y_train)
print('training R^2: %.2f' % ols.score(x_train, y_train))
print('testing R^2: %.2f' % ols.score(x_test, y_test))
# mean_squared_error returns MSE; take the square root to report RMSE
print('training RMSE: %.2f' % np.sqrt(mean_squared_error(y_train, ols.predict(x_train))))
print('testing RMSE: %.2f' % np.sqrt(mean_squared_error(y_test, ols.predict(x_test))))
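#### coefficient diagnostics (optional)
# A hedged sketch: statsmodels was imported in the original gist but never used.
# sm.OLS provides per-coefficient standard errors and p-values that sklearn's
# LinearRegression does not expose; the float casts are an assumption to keep
# statsmodels happy with the dummy-encoded frame.
import statsmodels.api as sm
x_train_const = sm.add_constant(x_train.astype(float))  # add an intercept column
sm_ols = sm.OLS(y_train.astype(float), x_train_const).fit()
print(sm_ols.summary())  # R^2, coefficients, standard errors, p-values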
#### ridge regression
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from bayes_opt import BayesianOptimization

# rebind to the training split so CV never touches the test set
data = x_train
target = y_train

#### Bayesian Optimization
def Ridgecv(alpha):
    # 5-fold CV; sklearn maximizes scores, so use the negated-MSE scorer
    return cross_val_score(linear_model.Ridge(alpha=float(alpha), random_state=2),
                           data, target, scoring='neg_mean_squared_error', cv=5).mean()

if __name__ == "__main__":
    RidgeBO = BayesianOptimization(Ridgecv, {'alpha': (0, 8)})
    RidgeBO.maximize(init_points=2, n_iter=10)
    print('Final Results')
    # current bayes_opt exposes the best result via .max, not .res['max']
    print('Ridge: %f' % RidgeBO.max['target'])
# refit with the alpha found by the optimizer (value taken from one BO run)
ridge = linear_model.Ridge(alpha=0.3985)
ridge.fit(x_train, y_train)
print('training R^2: %.2f' % ridge.score(x_train, y_train))
print('testing R^2: %.2f' % ridge.score(x_test, y_test))
print('training MSE: %.2f' % mean_squared_error(y_train, ridge.predict(x_train)))
print('testing MSE: %.2f' % mean_squared_error(y_test, ridge.predict(x_test)))
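#### alternative: built-in RidgeCV (optional)
# A minimal sketch, assuming the same x_train/y_train: sklearn's RidgeCV sweeps a
# fixed alpha grid with efficient leave-one-out cross-validation, which makes a
# cheap baseline to sanity-check the Bayesian-optimization result above.
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=np.logspace(-2, 1, 50)).fit(x_train, y_train)
print('RidgeCV alpha: %.4f' % ridge_cv.alpha_)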
#### randomforest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

RFR = RandomForestRegressor(max_features=14, n_estimators=300)
RFR.fit(x_train, y_train)
print('training MSE: %.2f' % mean_squared_error(y_train, RFR.predict(x_train)))
print('testing MSE: %.2f' % mean_squared_error(y_test, RFR.predict(x_test)))

# larger forest for comparison
RFR1 = RandomForestRegressor(max_features=14, n_estimators=500)
RFR1.fit(x_train, y_train)
print('training MSE: %.2f' % mean_squared_error(y_train, RFR1.predict(x_train)))
print('testing MSE: %.2f' % mean_squared_error(y_test, RFR1.predict(x_test)))
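#### out-of-bag check (optional)
# A hedged sketch, not in the original: with oob_score=True each tree is scored on
# the bootstrap samples it never saw, giving a validation-like R^2 without touching
# the test set. Hyperparameters mirror the 300-tree forest above.
rfr_oob = RandomForestRegressor(max_features=14, n_estimators=300, oob_score=True)
rfr_oob.fit(x_train, y_train)
print('OOB R^2: %.2f' % rfr_oob.oob_score_)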
#### xgboost
#### Bayesian Optimization
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              gamma,
              min_child_weight,
              subsample,
              colsample_bytree,
              n_jobs=-1):
    # the optimizer proposes floats; cast the integer-valued parameters back
    # (nthread/silent were replaced by n_jobs/verbosity in recent xgboost)
    return cross_val_score(xgb.XGBRegressor(max_depth=int(max_depth),
                                            learning_rate=learning_rate,
                                            n_estimators=int(n_estimators),
                                            n_jobs=n_jobs,
                                            gamma=gamma,
                                            min_child_weight=min_child_weight,
                                            subsample=subsample,
                                            colsample_bytree=colsample_bytree),
                           x_train,
                           y_train,
                           scoring='neg_mean_squared_error',
                           cv=5).mean()
if __name__ == "__main__":
    xgboostBO = BayesianOptimization(xgboostcv,
                                     {'max_depth': (3, 14),
                                      'learning_rate': (0.01, 0.2),
                                      'n_estimators': (50, 1000),
                                      'gamma': (0.01, 1.),  # bounds must be (low, high)
                                      'min_child_weight': (1, 10),
                                      'subsample': (0.5, 1),
                                      'colsample_bytree': (0.5, 1)})
    xgboostBO.maximize(init_points=2, n_iter=28)
    print('-' * 53)
    print('Final Results')
    print('XGBOOST: %f' % xgboostBO.max['target'])
# refit with the best hyperparameters from one BO run; fit only on the training
# split (the original refit the model on the test set, which leaks the holdout)
XGB = xgb.XGBRegressor(max_depth=14, learning_rate=0.1186, n_estimators=463,
                       n_jobs=-1, gamma=1.0, min_child_weight=6.1929,
                       subsample=0.9675, colsample_bytree=0.8544)
XGB.fit(x_train, y_train)
from sklearn.metrics import mean_squared_error
print('training MSE: %.2f' % mean_squared_error(y_train, XGB.predict(x_train)))
print('testing MSE: %.2f' % mean_squared_error(y_test, XGB.predict(x_test)))
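#### early stopping (optional)
# A hedged alternative to tuning n_estimators directly: stop adding trees once a
# held-out score stops improving. The 10% validation slice carved from x_train is
# an assumption, not from the original; constructor-level early_stopping_rounds
# needs a recent xgboost (older versions take it as a fit() argument).
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=0)
xgb_es = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.1, early_stopping_rounds=25)
xgb_es.fit(x_tr, y_tr, eval_set=[(x_val, y_val)], verbose=False)
print('best iteration: %d' % xgb_es.best_iteration)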
#### feature importance
import matplotlib.pyplot as plt

# pair column names with the forest's importances and sort descending
# (the original referenced an undefined x_trainsub; x_train is the frame in scope)
feature_importance = list(zip(x_train.columns, RFR.feature_importances_))
dtype = [('feature', 'U20'), ('importance', 'float')]
feature_importance = np.array(feature_importance, dtype=dtype)
feature_sort = np.sort(feature_importance, order='importance')[::-1]
df = pd.DataFrame(feature_sort[:20])  # keep only the top 20 for the plot

x = np.arange(1, 21)
y = df['importance']
LABELS = df['feature']
plt.figure()
plt.bar(x, y, align='center')
plt.xticks(x, LABELS, rotation=90)
plt.xlabel('Feature')
plt.ylabel('RFR Importance')
plt.title('RFR importance analysis of top 20 features')
plt.show()
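#### permutation importance (optional)
# A hedged cross-check, not in the original: impurity-based importances can favor
# high-cardinality dummies (e.g. the zipcode one-hots); permutation importance on
# the test split is model-agnostic and less biased in that respect.
from sklearn.inspection import permutation_importance
perm = permutation_importance(RFR, x_test, y_test, n_repeats=5, random_state=0)
top = np.argsort(perm.importances_mean)[::-1][:10]
for i in top:
    print('%s: %.4f' % (x_test.columns[i], perm.importances_mean[i]))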
#### ensemble: using linear regression to combine two models: randomforest and xgboost
# wrap each model's predictions in a named column (the original assigned to
# mismatched variable names such as pred_y_test vs pred_y_test_rf)
pred_y_test_rf = pd.DataFrame(RFR.predict(x_test), columns=['pred_y_testrf'])
pred_y_train_rf = pd.DataFrame(RFR.predict(x_train), columns=['pred_y_trainrf'])
pred_y_test_xgb = pd.DataFrame(XGB.predict(x_test), columns=['pred_y_testxgb'])
pred_y_train_xgb = pd.DataFrame(XGB.predict(x_train), columns=['pred_y_trainxgb'])

# fit a no-intercept linear blend of the two models' training predictions
pred_y_train_com = pd.concat([pred_y_train_rf, pred_y_train_xgb], axis=1)
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import math
ols = linear_model.LinearRegression(fit_intercept=False)
ols.fit(pred_y_train_com, y_train)
print('blend R^2: %.2f' % ols.score(pred_y_train_com, y_train))
print('training RMSE: %.2f' % math.sqrt(mean_squared_error(y_train, ols.predict(pred_y_train_com))))

# apply the learned blend to the test predictions (predict, not fit)
pred_y_test_com = pd.concat([pred_y_test_rf, pred_y_test_xgb], axis=1)
pred_y_ensemble = ols.predict(pred_y_test_com)
print('testing RMSE: %.2f' % math.sqrt(mean_squared_error(y_test, pred_y_ensemble)))

# reset y_test's index so the columns align on concat, then round to integer counts
pred_y_final = pd.concat([y_test.reset_index(drop=True), pred_y_test_rf, pred_y_test_xgb,
                          pd.DataFrame(pred_y_ensemble, columns=['pred_y_ensemble'])], axis=1)
pred_y_final = pred_y_final.astype(int)
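#### blend weights (optional)
# A quick sanity check, not in the original: with fit_intercept=False the ensemble
# prediction is just a weighted sum of the two base models, so the coefficients
# show how much the blend trusts each one.
for name, w in zip(pred_y_train_com.columns, np.ravel(ols.coef_)):
    print('%s weight: %.3f' % (name, w))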