# zhpmatrix/XGBRegressor.py

## XGBRegressor.py
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import pandas as pd
import scipy as sp
import xgboost as xgb

import matplotlib.pyplot as plt
#%matplotlib inline

# Load the rice-crop dataset; the CSV path is resolved against the working
# directory.
data = pd.read_csv('crop_rice.csv')

# loss_rate holds very small floats, so rescale it by 10000 before training.
# Plain column access replaces the `.ix` indexer, which pandas has removed.
data['loss_rate'] = data['loss_rate'] * 10000

# All columns but the last are features; the last column is the target
# (presumably loss_rate, given the plot labels below -- confirm against the CSV).
# `.iloc` / `.values` replace `.ix` / `.as_matrix()`, both removed from pandas.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# One-hot encode every feature column (each one is treated as categorical).
X = OneHotEncoder().fit_transform(X)

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=668)

# NOTE(review): `silent`, `objective='reg:linear'` and the fit-time
# `eval_metric` / `early_stopping_rounds` arguments are the spellings of the
# older xgboost API this script was written against; newer releases rename or
# relocate them -- check the installed version before upgrading xgboost.
xgbr = xgb.XGBRegressor(max_depth=10,
                        learning_rate=0.1,
                        n_estimators=60,
                        silent=True,
                        min_child_weight=1,
                        objective='reg:linear')
# Record RMSE on both splits after every boosting round. Early stopping is
# driven by the held-out split that is also used for the final report, so the
# "test" numbers printed below are not from fully unseen data.
xgbr.fit(X_train, y_train,
         eval_metric='rmse',
         verbose=False,
         eval_set=[(X_train, y_train), (X_test, y_test)],
         early_stopping_rounds=10)

# NOTE(review): this second regressor is never used in the visible script;
# presumably a leftover base estimator for the imported GridSearchCV --
# confirm before removing.
_xgbr = xgb.XGBRegressor(seed=668)

# NOTE(review): the original author flagged that `max_num_features` could not
# be used with plot_importance (presumably on some xgboost releases) --
# confirm that the installed version accepts it.
xgb.plot_importance(xgbr, max_num_features=10)

# Single-argument print() emits the same text under Python 2 and Python 3.
print('Best score:  {} Best iteration:  {}'.format(
    xgbr.best_score, xgbr.best_iteration))

# Learning curves: validation_0 is the training split and validation_1 the
# held-out split, matching the order of eval_set above.
evals_result = xgbr.evals_result()

epochesNum = len(evals_result['validation_0']['rmse'])
epoches = range(0, epochesNum)

plt.figure()
plt.plot(epoches, evals_result['validation_0']['rmse'], label='train')
plt.plot(epoches, evals_result['validation_1']['rmse'], label='test')
plt.xlabel('epoch')
plt.ylabel('rmse')
plt.title('crop_rice_1')
plt.legend()

# Score the held-out split.
preds = xgbr.predict(X_test)
print('Prediction:  {}'.format(preds))

# The reported error is the square root of the MSE, so it is labelled RMSE
# (the old label said "MSE"). `** 0.5` replaces the deprecated `sp.sqrt`
# alias, and the arguments now follow the (y_true, y_pred) order that the
# r2_score call already uses.
rmse = mean_squared_error(y_test, preds) ** 0.5
print('RMSE:  {}'.format(rmse))
print('R2_score:  {}'.format(r2_score(y_test, preds)))

# Overlay real and predicted targets, sample by sample, on a fresh figure.
plt.figure()
idxs = range(0, y_test.shape[0])
plt.plot(idxs, y_test, label='real')
plt.plot(idxs, preds, label='predict')
plt.xlabel('index')
plt.ylabel('loss_rate')
plt.title('crop_rice_2')
plt.legend()
plt.show()
	from sklearn.model_selection import train_test_split,GridSearchCV
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.metrics import mean_squared_error
	from sklearn.metrics import r2_score

	import pandas as pd
	import scipy as sp
	import xgboost as xgb

	import matplotlib.pyplot as plt
	#%matplotlib inline

	# NOTE(review): from the re-imports just above through plt.show(), this
	# section repeats the script's pipeline with every line indented by one
	# tab. No enclosing def/if/with is visible, so at module level this
	# indentation is a syntax error; it looks like a paste artifact or the body
	# of a lost function -- confirm, then dedent/delete it or restore its header.

	# Load the rice-crop dataset; the CSV path is resolved against the working
	# directory.
	data = pd.read_csv('crop_rice.csv')

	# loss_rate holds very small floats, so rescale it by 10000 before training.
	# Plain column access replaces the `.ix` indexer, which pandas has removed.
	data['loss_rate'] = data['loss_rate'] * 10000

	# All columns but the last are features; the last column is the target
	# (presumably loss_rate, given the plot labels below -- confirm against the CSV).
	# `.iloc` / `.values` replace `.ix` / `.as_matrix()`, both removed from pandas.
	X = data.iloc[:, :-1].values
	y = data.iloc[:, -1].values

	# One-hot encode every feature column (each one is treated as categorical).
	X = OneHotEncoder().fit_transform(X)

	# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
	# reproducible.
	X_train, X_test, y_train, y_test = train_test_split(
		X, y, test_size=0.1, random_state=668)

	# NOTE(review): `silent`, `objective='reg:linear'` and the fit-time
	# `eval_metric` / `early_stopping_rounds` arguments are the spellings of the
	# older xgboost API this script was written against; newer releases rename or
	# relocate them -- check the installed version before upgrading xgboost.
	xgbr = xgb.XGBRegressor(max_depth=10,
		learning_rate=0.1,
		n_estimators=60,
		silent=True,
		min_child_weight=1,
		objective='reg:linear')
	# Record RMSE on both splits after every boosting round. Early stopping is
	# driven by the held-out split that is also used for the final report, so the
	# "test" numbers printed below are not from fully unseen data.
	xgbr.fit(X_train, y_train,
		eval_metric='rmse',
		verbose=False,
		eval_set=[(X_train, y_train), (X_test, y_test)],
		early_stopping_rounds=10)

	# NOTE(review): this second regressor is never used in the visible script;
	# presumably a leftover base estimator for the imported GridSearchCV --
	# confirm before removing.
	_xgbr = xgb.XGBRegressor(seed=668)

	# NOTE(review): the original author flagged that `max_num_features` could not
	# be used with plot_importance (presumably on some xgboost releases) --
	# confirm that the installed version accepts it.
	xgb.plot_importance(xgbr, max_num_features=10)

	# Single-argument print() emits the same text under Python 2 and Python 3.
	print('Best score:  {} Best iteration:  {}'.format(
		xgbr.best_score, xgbr.best_iteration))

	# Learning curves: validation_0 is the training split and validation_1 the
	# held-out split, matching the order of eval_set above.
	evals_result = xgbr.evals_result()

	epochesNum = len(evals_result['validation_0']['rmse'])
	epoches = range(0, epochesNum)

	plt.figure()
	plt.plot(epoches, evals_result['validation_0']['rmse'], label='train')
	plt.plot(epoches, evals_result['validation_1']['rmse'], label='test')
	plt.xlabel('epoch')
	plt.ylabel('rmse')
	plt.title('crop_rice_1')
	plt.legend()

	# Score the held-out split.
	preds = xgbr.predict(X_test)
	print('Prediction:  {}'.format(preds))

	# The reported error is the square root of the MSE, so it is labelled RMSE
	# (the old label said "MSE"). `** 0.5` replaces the deprecated `sp.sqrt`
	# alias, and the arguments now follow the (y_true, y_pred) order that the
	# r2_score call already uses.
	rmse = mean_squared_error(y_test, preds) ** 0.5
	print('RMSE:  {}'.format(rmse))
	print('R2_score:  {}'.format(r2_score(y_test, preds)))

	# Overlay real and predicted targets, sample by sample, on a fresh figure.
	plt.figure()
	idxs = range(0, y_test.shape[0])
	plt.plot(idxs, y_test, label='real')
	plt.plot(idxs, preds, label='predict')
	plt.xlabel('index')
	plt.ylabel('loss_rate')
	plt.title('crop_rice_2')
	plt.legend()
	plt.show()