@shuozhang1985
Last active June 3, 2019 12:16
#### linear regression
from __future__ import print_function
from __future__ import division
import os
import numpy as np
import pandas as pd

os.chdir("/Users/shuozhang/Desktop/data")
# read zipcode as a string so it is treated as a categorical, not a number
nycmodel = pd.read_csv('nycmodeldata.csv', sep='\t', index_col=False, dtype={'zipcode': str})
# one-hot encode zipcode, then drop the raw column and the stray index column
zip_dummies = pd.get_dummies(nycmodel['zipcode']).astype(int)
nycmodel = pd.concat([nycmodel, zip_dummies], axis=1)
nycmodel.drop(['zipcode', 'Unnamed: 0'], inplace=True, axis=1)
target = nycmodel[['count']]
data = nycmodel[[col for col in nycmodel.columns if col != 'count']]
# train_test_split moved to sklearn.model_selection (sklearn.cross_validation was removed)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=0)
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

ols = linear_model.LinearRegression()
ols.fit(x_train, y_train)
print('training R^2: %.2f' % ols.score(x_train, y_train))
print('testing R^2: %.2f' % ols.score(x_test, y_test))
# mean_squared_error returns MSE; take the square root to report RMSE
print('training RMSE: %.2f' % np.sqrt(mean_squared_error(y_train, ols.predict(x_train))))
print('testing RMSE: %.2f' % np.sqrt(mean_squared_error(y_test, ols.predict(x_test))))
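# The gist imports statsmodels but never uses it; a minimal sketch of how it
# could report coefficient significance for the same fit (assumes the
# dummy-encoded design matrix is all-numeric, as constructed above):
import statsmodels.api as sm
sm_ols = sm.OLS(y_train, sm.add_constant(x_train.astype(float))).fit()
print(sm_ols.summary())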
#### ridge regression
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from bayes_opt import BayesianOptimization

data = x_train
target = y_train

#### Bayesian Optimization
# the optimizer maximizes, so score with negated MSE: maximizing -MSE minimizes MSE
def Ridgecv(alpha):
    return cross_val_score(linear_model.Ridge(alpha=float(alpha), random_state=2),
                           data, target, scoring='neg_mean_squared_error', cv=5).mean()
if __name__ == "__main__":
    RidgeBO = BayesianOptimization(Ridgecv, {'alpha': (0, 8)})
    RidgeBO.maximize(init_points=2, n_iter=10)
    print('Final Results')
    # bayes_opt >= 1.0 exposes the best score as .max['target']
    print('Ridge: %f' % RidgeBO.max['target'])
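    # The fixed alpha used below was presumably read off the optimizer's best
    # parameters; a minimal sketch using the bayes_opt >= 1.x API (the exact
    # value varies from run to run):
    best_alpha = RidgeBO.max['params']['alpha']
    print('best alpha: %.4f' % best_alpha)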
ridge = linear_model.Ridge(alpha=0.3985)
ridge.fit(x_train, y_train)
print('training R^2: %.2f' % ridge.score(x_train, y_train))
print('testing R^2: %.2f' % ridge.score(x_test, y_test))
print('training MSE: %.2f' % mean_squared_error(y_train, ridge.predict(x_train)))
print('testing MSE: %.2f' % mean_squared_error(y_test, ridge.predict(x_test)))
#### randomforest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# .values.ravel() turns the one-column target frame into the 1-D array sklearn expects
RFR = RandomForestRegressor(max_features=14, n_estimators=300)
RFR.fit(x_train, y_train.values.ravel())
print('300 trees - training MSE: %.2f' % mean_squared_error(y_train, RFR.predict(x_train)))
print('300 trees - testing MSE: %.2f' % mean_squared_error(y_test, RFR.predict(x_test)))

RFR1 = RandomForestRegressor(max_features=14, n_estimators=500)
RFR1.fit(x_train, y_train.values.ravel())
print('500 trees - training MSE: %.2f' % mean_squared_error(y_train, RFR1.predict(x_train)))
print('500 trees - testing MSE: %.2f' % mean_squared_error(y_test, RFR1.predict(x_test)))
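# A hedged aside: out-of-bag R^2 compares forest sizes without touching the
# test set (assumes the default bootstrap sampling, which OOB scoring requires)
RFR_oob = RandomForestRegressor(max_features=14, n_estimators=500, oob_score=True)
RFR_oob.fit(x_train, y_train.values.ravel())
print('OOB R^2: %.3f' % RFR_oob.oob_score_)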
#### xgboost
#### Bayesian Optimization
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              gamma,
              min_child_weight,
              subsample,
              colsample_bytree,
              silent=True,
              nthread=-1):
    # the optimizer proposes floats, so cast the integer-valued parameters
    return cross_val_score(xgb.XGBRegressor(max_depth=int(max_depth),
                                            learning_rate=learning_rate,
                                            n_estimators=int(n_estimators),
                                            silent=silent,
                                            nthread=nthread,
                                            gamma=gamma,
                                            min_child_weight=min_child_weight,
                                            subsample=subsample,
                                            colsample_bytree=colsample_bytree),
                           x_train,
                           y_train,
                           scoring='neg_mean_squared_error',
                           cv=5).mean()

if __name__ == "__main__":
    xgboostBO = BayesianOptimization(xgboostcv,
                                     {'max_depth': (3, 14),
                                      'learning_rate': (0.01, 0.2),
                                      'n_estimators': (50, 1000),
                                      'gamma': (0.01, 1.),  # bounds are (low, high)
                                      'min_child_weight': (1, 10),
                                      'subsample': (0.5, 1),
                                      'colsample_bytree': (0.5, 1)})
    xgboostBO.maximize(init_points=2, n_iter=28)
    print('-' * 53)
    print('Final Results')
    print('XGBOOST: %f' % xgboostBO.max['target'])
XGB = xgb.XGBRegressor(max_depth=14, learning_rate=0.1186, n_estimators=463, silent=True,
                       nthread=-1, gamma=1.0, min_child_weight=6.1929, subsample=0.9675,
                       colsample_bytree=0.8544)
# fit on the training set only; the original also refit on the test set, which leaks labels
XGB.fit(x_train, y_train)
from sklearn.metrics import mean_squared_error
print('training MSE: %.2f' % mean_squared_error(y_train, XGB.predict(x_train)))
print('testing MSE: %.2f' % mean_squared_error(y_test, XGB.predict(x_test)))
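# A quick hedged summary comparing all four fitted models on the held-out set
# (RMSE, in the target's units; the names refer to the estimators fitted above):
for name, model in [('OLS', ols), ('Ridge', ridge), ('RF', RFR1), ('XGB', XGB)]:
    print('%s test RMSE: %.3f' % (name, np.sqrt(mean_squared_error(y_test, model.predict(x_test)))))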
#### feature importance
# zip() is lazy in Python 3, so materialize it; the original referenced the
# undefined x_trainsub - the forest was fit on x_train
feature_importance = list(zip(x_train.columns, RFR.feature_importances_))
dtype = [('feature', 'S10'), ('importance', 'float')]
feature_importance = np.array(feature_importance, dtype=dtype)
feature_sort = np.sort(feature_importance, order='importance')[::-1]
df = pd.DataFrame(feature_sort[:20])  # keep only the top 20 for the plot
import pylab as plt
x = np.arange(1, 21)
y = df['importance']
LABELS = df['feature']
plt.figure()
plt.bar(x, y, align='center')
plt.xticks(x, LABELS, rotation=90)
plt.xlabel('Feature')
plt.ylabel('RFR Importance')
plt.title('RFR importance analysis of top 20 features')
plt.show()
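# The same top-20 chart as a one-liner pandas idiom (an equivalent plot, not
# the gist's original approach):
pd.Series(RFR.feature_importances_, index=x_train.columns).nlargest(20).plot(
    kind='bar', title='RFR importance analysis of top 20 features')
plt.show()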
#### ensemble: use linear regression to combine two models, random forest and xgboost
# wrap each model's predictions in a one-column DataFrame (the original wrapped
# the wrong, undefined variables)
pred_y_test_rf = pd.DataFrame(RFR.predict(x_test), columns=['pred_y_testrf'])
pred_y_train_rf = pd.DataFrame(RFR.predict(x_train), columns=['pred_y_trainrf'])
pred_y_test_xgb = pd.DataFrame(XGB.predict(x_test), columns=['pred_y_testxgb'])
pred_y_train_xgb = pd.DataFrame(XGB.predict(x_train), columns=['pred_y_trainxgb'])
pred_y_train_com = pd.concat([pred_y_train_rf, pred_y_train_xgb], axis=1)
from sklearn import linear_model
ols = linear_model.LinearRegression(fit_intercept=False)
ols.fit(pred_y_train_com, y_train)
print('ensemble training R^2: %.2f' % ols.score(pred_y_train_com, y_train))
from sklearn.metrics import mean_squared_error
import math
# evaluate the blended prediction, not the raw two-column frame
print('ensemble training RMSE: %.2f' % math.sqrt(
    mean_squared_error(y_train, ols.predict(pred_y_train_com))))
pred_y_test_com = pd.concat([pred_y_test_rf, pred_y_test_xgb], axis=1)
# predict, not fit: the blender was already trained on the training predictions
pred_y_ensemble = pd.DataFrame(ols.predict(pred_y_test_com), columns=['pred_y_ensemble'])
print('ensemble testing RMSE: %.2f' % math.sqrt(mean_squared_error(y_test, pred_y_ensemble)))
# reset y_test's index so the concat aligns row-for-row with the prediction frames
pred_y_final = pd.concat([y_test.reset_index(drop=True), pred_y_test_rf,
                          pred_y_test_xgb, pred_y_ensemble], axis=1)
pred_y_final = pred_y_final.applymap(int)
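# With fit_intercept=False, the blender's coefficients are the learned weights
# on the RF and XGB predictions; printing them shows how the ensemble mixes the
# two models:
print('blend weights (RF, XGB):', ols.coef_)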
luoqi23 commented Jun 3, 2019

Teacher, could you share the final forecasted dataset? Reading this article has really inspired me, but the firewall in China keeps me from downloading it. Could you share the final combined dataset? My email: 827580587@qq.com
