import pandas as pd
import numpy as np
import seaborn as sns
sns.set(style='white')
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
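# Load the data. The file name 'datasets.csv' comes from the original script;
# the columns used below (gender, Topic, raisedhands, StudentAbsenceDays, ...)
# suggest a student-performance dataset with a categorical 'Class' label.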
df = pd.read_csv("datasets.csv")
#print(df.shape)
df = df.drop('PlaceofBirth', axis=1)
#print(df.head(5))
print(df.describe())
ls = ['gender','Relation','Topic','SectionID','GradeID','NationalITy','Class','StageID','Semester','ParentAnsweringSurvey','ParentschoolSatisfaction','StudentAbsenceDays']
for i in ls:
    g = sns.catplot(x=i, data=df, kind='count', height=3)
print(df.shape)
#preprocessing
target = df.pop('Class')
X = pd.get_dummies(df)
le = LabelEncoder()
y = le.fit_transform(target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
ss = StandardScaler()
#print(X_train)
X_train_std = ss.fit_transform(X_train)
X_test_std = ss.transform(X_test)  # reuse the scaler fitted on the training data; do not refit on the test set
#print(X_train_std)
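# Note: 'Class' is a categorical label that LabelEncoder maps to integers;
# the rest of the script treats those integers as a regression target.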
#dimensionality_reduction
feat_labels = X.columns[:58]
forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
h = sns.barplot(x=importances[indices], y=feat_labels[indices])
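# Quick sanity check: the hand-picked feature list used in the next step should
# roughly match the six highest-ranked features printed above.
top_features = list(feat_labels[indices[:6]])
print("Top 6 features by importance:", top_features)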
#removing dimensions: keep only the most important features
# work on copies so the original train/test frames keep the full feature set
X_train_new = X_train.copy()
X_test_new = X_test.copy()
ls = ['VisITedResources','raisedhands','AnnouncementsView','StudentAbsenceDays_Above-7','StudentAbsenceDays_Under-7','Discussion']
for i in X_train.columns:
    if i not in ls:
        X_train_new.drop(i, axis=1, inplace=True)
for i in X_test.columns:
    if i not in ls:
        X_test_new.drop(i, axis=1, inplace=True)
#spot checking algorithms
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X_train_new, y_train, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
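# cross_val_score with scoring='neg_mean_squared_error' returns negated MSE,
# so values closer to zero indicate better fits in the printouts above.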
# Standardize the dataset
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])))
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledSVR', Pipeline([('Scaler', StandardScaler()), ('SVR', SVR())])))
results = []
names = []
for name, model in pipelines:
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X_train_new, y_train, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
fig = plt.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
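# Wrapping each estimator in a Pipeline keeps the scaling inside each CV fold;
# this mainly matters for the scale-sensitive models here (KNN and SVR), while
# the tree-based models are largely unaffected by standardization.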
#lasso algorithm tuning
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
k_values = np.array([.1, .11, .12, .13, .14, .15, .16, .09, .08, .07, .06, .05, .04])
param_grid = dict(alpha=k_values)
model = Lasso()
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=kfold)
grid_result = grid.fit(rescaledX, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
#using ensembles
ensembles = []
ensembles.append(('ScaledAB', Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])))
ensembles.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor())])))
ensembles.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestRegressor())])))
ensembles.append(('ScaledET', Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesRegressor())])))
results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Scaled Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Tune scaled AdaBoostRegressor
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = dict(n_estimators=np.array([50, 100, 150, 200, 250, 300, 350, 400]))
model = AdaBoostRegressor(random_state=7)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=kfold)
grid_result = grid.fit(rescaledX, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# prepare the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = GradientBoostingRegressor(random_state=7, n_estimators=400)
model.fit(rescaledX, y_train)
# transform the validation dataset
rescaledValidationX = scaler.transform(X_test)
predictions = model.predict(rescaledValidationX)
print(mean_squared_error(y_test, predictions))
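# A minimal follow-up sketch: also report the error as RMSE, which is on the
# same scale as the encoded labels and therefore a bit easier to interpret.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("Validation RMSE: %f" % rmse)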