import pandas as pd
import numpy as np
import seaborn as sns
sns.set(style='white')
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
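# Load the data. The file name 'datasets.csv' comes from the original script;
# the columns used below (gender, Topic, raisedhands, StudentAbsenceDays, ...)
# suggest a student-performance dataset with a categorical 'Class' label.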
df = pd.read_csv("datasets.csv")
#print(df.shape)
df = df.drop('PlaceofBirth', axis=1)
#print(df.head(5))
print(df.describe())
ls = ['gender','Relation','Topic','SectionID','GradeID','NationalITy','Class','StageID','Semester','ParentAnsweringSurvey','ParentschoolSatisfaction','StudentAbsenceDays']
for i in ls:
    g = sns.catplot(x=i, data=df, kind='count', height=3)
print(df.shape)
#preprocessing
target = df.pop('Class')
X = pd.get_dummies(df)
le = LabelEncoder()
y = le.fit_transform(target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
ss = StandardScaler()
#print(X_train)
X_train_std = ss.fit_transform(X_train)
X_test_std = ss.transform(X_test)  # reuse the scaler fitted on the training data; do not refit on the test set
#print(X_train_std)
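# Note: 'Class' is a categorical label that LabelEncoder maps to integers;
# the rest of the script treats those integers as a regression target.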
#dimensionality_reduction
feat_labels = X.columns[:58]
forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
h = sns.barplot(x=importances[indices], y=feat_labels[indices])
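# Quick sanity check: the hand-picked feature list used in the next step should
# roughly match the six highest-ranked features printed above.
top_features = list(feat_labels[indices[:6]])
print("Top 6 features by importance:", top_features)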
#removing dimensions: keep only the most important features
# work on copies so the original train/test frames keep the full feature set
X_train_new = X_train.copy()
X_test_new = X_test.copy()
ls = ['VisITedResources','raisedhands','AnnouncementsView','StudentAbsenceDays_Above-7','StudentAbsenceDays_Under-7','Discussion']
for i in X_train.columns:
    if i not in ls:
        X_train_new.drop(i, axis=1, inplace=True)
for i in X_test.columns:
    if i not in ls:
        X_test_new.drop(i, axis=1, inplace=True)
#spot checking algorithms
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X_train_new, y_train, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
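# cross_val_score with scoring='neg_mean_squared_error' returns negated MSE,
# so values closer to zero indicate better fits in the printouts above.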
# Standardize the dataset
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])))
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledSVR', Pipeline([('Scaler', StandardScaler()), ('SVR', SVR())])))
results = []
names = []
for name, model in pipelines:
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X_train_new, y_train, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
fig = plt.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
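# Wrapping each estimator in a Pipeline keeps the scaling inside each CV fold;
# this mainly matters for the scale-sensitive models here (KNN and SVR), while
# the tree-based models are largely unaffected by standardization.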
#lasso algorithm tuning
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
k_values = np.array([.1, .11, .12, .13, .14, .15, .16, .09, .08, .07, .06, .05, .04])
param_grid = dict(alpha=k_values)
model = Lasso()
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=kfold)
grid_result = grid.fit(rescaledX, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
#using ensembles
ensembles = []
ensembles.append(('ScaledAB', Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])))
ensembles.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor())])))
ensembles.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestRegressor())])))
ensembles.append(('ScaledET', Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesRegressor())])))
results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Scaled Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Tune scaled AdaBoostRegressor
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = dict(n_estimators=np.array([50, 100, 150, 200, 250, 300, 350, 400]))
model = AdaBoostRegressor(random_state=7)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=kfold)
grid_result = grid.fit(rescaledX, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# prepare the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = GradientBoostingRegressor(random_state=7, n_estimators=400)
model.fit(rescaledX, y_train)
# transform the validation dataset
rescaledValidationX = scaler.transform(X_test)
predictions = model.predict(rescaledValidationX)
print(mean_squared_error(y_test, predictions))
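# A minimal follow-up sketch: also report the error as RMSE, which is on the
# same scale as the encoded labels and therefore a bit easier to interpret.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("Validation RMSE: %f" % rmse)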