#!/usr/bin/env python
# encoding: utf-8
# GridSearchCV moved from sklearn.grid_search to sklearn.model_selection
# in scikit-learn 0.18.
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from features_preprocess import preprocess
import pandas as pd

__author__ = 'Rodolfo Yllada (ryllada@gmail.com)'
__filename__ = 'titanic_train.csv'


class ClassifiersEvaluation(object):
    """Grid-search a battery of scikit-learn classifiers on one dataset."""

    data = None
    predictors_base = [
        "PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch",
        "Ticket", "Fare", "Cabin", "Embarked", ]
    predictors = []
    target = "Survived"

    def __init__(self, filename):
        self.data = pd.read_csv(filename)
        self.data, self.predictors = preprocess(
            self.data, self.predictors_base)

    def make_classification(self):
        # Display names, one per estimator below; the two lists must stay
        # in the same order because they are zipped together.
        names = [
            "Nearest Neighbors",
            "Linear SVM",
            "RBF SVM",
            "Decision Tree",
            "Random Forest",
            "Extra Trees",
            "Gradient Boosting",
            "AdaBoost",
            "Naive Bayes",
            "Linear Discriminant Analysis",
            "Quadratic Discriminant Analysis",
            "Logistic Regression",
            "Perceptron",
        ]
        # Hyperparameter grid for each classifier, keyed by display name.
        params_classifiers = {
            "Nearest Neighbors": {
                'n_neighbors': range(2, 10),
                'weights': ('uniform', 'distance', ), },
            "Linear SVM": {
                'kernel': ('linear', ),
                'C': [1, 10, ],
                'gamma': [0.001, 0.0001],
                'class_weight': ('balanced', None, ), },
            "RBF SVM": {
                'kernel': ('rbf', ),
                'C': [1, 10, ],
                'gamma': [0.001, 0.0001],
                'class_weight': ('balanced', None, ), },
            "Decision Tree": {
                'max_depth': (5, 10, 20, 50, 100, 500, ),
                'max_features': range(
                    len(self.predictors) - 5, len(self.predictors)), },
            "Random Forest": {
                'max_depth': (5, 10, 20, ),
                'n_estimators': (50, 100, ),
                'max_features': range(
                    len(self.predictors) - 5, len(self.predictors)), },
            "Extra Trees": {
                'max_depth': (5, 10, 20, ),
                'n_estimators': (50, 100, ),
                'max_features': range(
                    len(self.predictors) - 5, len(self.predictors)), },
            "Gradient Boosting": {
                'n_estimators': [50, 100, ],
                'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001, ], },
            "AdaBoost": {
                'n_estimators': [50, 100, ],
                'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001, ], },
            "Naive Bayes": {},
            "Linear Discriminant Analysis": {},
            "Quadratic Discriminant Analysis": {},
            "Logistic Regression": {
                'penalty': ('l1', 'l2', ),
                'C': [0.001, 0.01, 0.1, 1, 10, 100, 500, ], },
            "Perceptron": {
                'penalty': ('l1', 'l2', ),
                'alpha': [0.1, 0.01, ],
                # 'n_iter' was renamed 'max_iter' in scikit-learn 0.19+.
                'max_iter': [1, 2, 5, 10, 100, 500, ], },
        }
        classifiers = [
            KNeighborsClassifier(),
            SVC(),
            SVC(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            ExtraTreesClassifier(),
            GradientBoostingClassifier(),
            AdaBoostClassifier(),
            GaussianNB(),
            LinearDiscriminantAnalysis(),
            QuadraticDiscriminantAnalysis(),
            # liblinear is the solver that supports the 'l1' penalty used
            # in the grid above (it was the default in older scikit-learn).
            LogisticRegression(solver='liblinear'),
            Perceptron(),
        ]
print "====================================="
dict_result = {}
for name, classifier in zip(names, classifiers):
if name in ('Linear SVM', 'RBF SVM', 'Random Forest', 'Extra Trees',
'Gradient Boosting', 'AdaBoost', 'Logistic Regression',
'Perceptron', ):
verbose = 2
else:
verbose = 0
print name
param_search = GridSearchCV(
estimator=classifier, param_grid=params_classifiers[name],
verbose=verbose)
param_search.fit(self.data[self.predictors], self.data[self.target])
dict_result[name] = {
'score': param_search.best_score_,
'params': param_search.best_params_,
'feature_importances':
param_search.best_estimator_.feature_importances_
if hasattr(
param_search.best_estimator_, 'feature_importances_')
else None, }
print 'Score:', param_search.best_score_
print 'Params:', param_search.best_params_
print
print
print "====================================="
print "RESULTS:"
print
for key in sorted(dict_result.keys(),
key=lambda x: dict_result[x]['score'], reverse=True):
print "%.2f - %s - %s" % (
dict_result[key]['score'], key, dict_result[key]['params'], )
if dict_result[key]['feature_importances'] is not None:
for variable, factor in sorted(
zip(self.predictors,
dict_result[key]['feature_importances']),
key=lambda x: x[1], reverse=True):
print "%s: %f" % (
variable.capitalize().replace("_", " "),
factor, )
print
print
print "====================================="
return


if __name__ == '__main__':
    c_eval = ClassifiersEvaluation(__filename__)
    c_eval.make_classification()
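

# ---------------------------------------------------------------------------
# The script imports `preprocess` from a features_preprocess module that is
# not included in this gist. Below is a minimal, hypothetical sketch of what
# that helper could look like for the Titanic CSV: impute missing values,
# encode the categorical columns, and return only numeric predictors so that
# GridSearchCV.fit() receives a purely numeric matrix. The author's actual
# implementation may differ.
# ---------------------------------------------------------------------------
# features_preprocess.py (sketch)
import pandas as pd


def preprocess(data, predictors_base):
    """Return (cleaned DataFrame, list of usable numeric predictors)."""
    data = data.copy()
    # Impute missing numeric values with the column median.
    data["Age"] = data["Age"].fillna(data["Age"].median())
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())
    # Encode the two categorical columns as small integers.
    data["Sex"] = data["Sex"].map({"male": 0, "female": 1})
    data["Embarked"] = data["Embarked"].fillna("S").map(
        {"S": 0, "C": 1, "Q": 2})
    # Name, Ticket and Cabin are free text in the raw CSV; keep only the
    # columns that are numeric after encoding.
    predictors = [
        p for p in predictors_base
        if p in data.columns and pd.api.types.is_numeric_dtype(data[p])
    ]
    return data, predictors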