Created
October 11, 2016 11:41
-
-
Save ryllada/1a83bd49cb025f24b0cdcbb4a0601c81 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
from sklearn.grid_search import GridSearchCV | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.linear_model import Perceptron | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.ensemble import ExtraTreesClassifier | |
from sklearn.ensemble import GradientBoostingClassifier | |
from sklearn.svm import SVC | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import AdaBoostClassifier | |
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis | |
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis | |
from features_preprocess import preprocess | |
import pandas as pd | |
__author__ = 'Rodolfo Yllada (ryllada@gmail.com)' | |
__filename__ = 'titanic_train.csv' | |
class ClassifiersEvaluation(object): | |
data = None | |
predictors_base = [ | |
"PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", | |
"Ticket", "Fare", "Cabin", "Embarked", ] | |
predictors = [] | |
target = "Survived" | |
def __init__(self, filename): | |
self.data = pd.read_csv(filename) | |
self.data, self.predictors = preprocess(self.data, self.predictors_base) | |
return | |
def make_classification(self): | |
names = [ | |
"Nearest Neighbors", | |
"Linear SVM", | |
"RBF SVM", | |
"Decision Tree", | |
"Random Forest", | |
"Extra Trees", | |
"Gradient Boosting", | |
"AdaBoost", | |
"Naive Bayes", | |
"Linear Discriminant Analysis", | |
"Quadratic Discriminant Analysis", | |
"Logistic Regression", | |
"Perceptron", | |
] | |
params_classifiers = { | |
"Nearest Neighbors": { | |
'n_neighbors': range(2, 10), | |
'weights': ('uniform', 'distance', ), }, | |
"Linear SVM": { | |
'kernel': ('linear', ), | |
'C': [1, 10, ], | |
'gamma': [0.001, 0.0001], | |
'class_weight': ('balanced', None, ), }, | |
"RBF SVM": { | |
'kernel': ('rbf', ), | |
'C': [1, 10, ], | |
'gamma': [0.001, 0.0001], | |
'class_weight': ('balanced', None, ), }, | |
"Decision Tree": { | |
'max_depth': (5, 10, 20, 50, 100, 500, ), | |
'max_features': range( | |
len(self.predictors) - 5, len(self.predictors)), }, | |
"Random Forest": { | |
'max_depth': (5, 10, 20, ), | |
'n_estimators': (50, 100, ), | |
'max_features': range( | |
len(self.predictors) - 5, len(self.predictors)), }, | |
"Extra Trees": { | |
'max_depth': (5, 10, 20, ), | |
'n_estimators': (50, 100, ), | |
'max_features': range( | |
len(self.predictors) - 5, len(self.predictors)), }, | |
"Gradient Boosting": { | |
'n_estimators': [50, 100, ], | |
'learning_rate': [.1, 0.05, 0.01, 0.005, 0.001, ], }, | |
"AdaBoost": { | |
'n_estimators': [50, 100, ], | |
'learning_rate': [.1, 0.05, 0.01, 0.005, 0.001, ], }, | |
"Naive Bayes": {}, | |
"Linear Discriminant Analysis": {}, | |
"Quadratic Discriminant Analysis": {}, | |
"Logistic Regression": { | |
'penalty': ('l1', 'l2', ), | |
'C': [0.001, 0.01, 0.1, 1, 10, 100, 500, ], }, | |
"Perceptron": { | |
'penalty': ('l1', 'l2', ), | |
'alpha': [0.1, 0.01, ], | |
'n_iter': [1, 2, 5, 10, 100, 500, ], } | |
} | |
classifiers = [ | |
KNeighborsClassifier(), | |
SVC(), | |
SVC(), | |
DecisionTreeClassifier(), | |
RandomForestClassifier(), | |
ExtraTreesClassifier(), | |
GradientBoostingClassifier(), | |
AdaBoostClassifier(), | |
GaussianNB(), | |
LinearDiscriminantAnalysis(), | |
QuadraticDiscriminantAnalysis(), | |
LogisticRegression(), | |
Perceptron(), | |
] | |
print "=====================================" | |
dict_result = {} | |
for name, classifier in zip(names, classifiers): | |
if name in ('Linear SVM', 'RBF SVM', 'Random Forest', 'Extra Trees', | |
'Gradient Boosting', 'AdaBoost', 'Logistic Regression', | |
'Perceptron', ): | |
verbose = 2 | |
else: | |
verbose = 0 | |
print name | |
param_search = GridSearchCV( | |
estimator=classifier, param_grid=params_classifiers[name], | |
verbose=verbose) | |
param_search.fit(self.data[self.predictors], self.data[self.target]) | |
dict_result[name] = { | |
'score': param_search.best_score_, | |
'params': param_search.best_params_, | |
'feature_importances': | |
param_search.best_estimator_.feature_importances_ | |
if hasattr( | |
param_search.best_estimator_, 'feature_importances_') | |
else None, } | |
print 'Score:', param_search.best_score_ | |
print 'Params:', param_search.best_params_ | |
print "=====================================" | |
print "RESULTS:" | |
for key in sorted(dict_result.keys(), | |
key=lambda x: dict_result[x]['score'], reverse=True): | |
print "%.2f - %s - %s" % ( | |
dict_result[key]['score'], key, dict_result[key]['params'], ) | |
if dict_result[key]['feature_importances'] is not None: | |
for variable, factor in sorted( | |
zip(self.predictors, | |
dict_result[key]['feature_importances']), | |
key=lambda x: x[1], reverse=True): | |
print "%s: %f" % ( | |
variable.capitalize().replace("_", " "), | |
factor, ) | |
print "=====================================" | |
return | |
if __name__ == '__main__':
    # Run the full classifier evaluation against the bundled training CSV.
    evaluation = ClassifiersEvaluation(__filename__)
    evaluation.make_classification()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment