Gist by @CalebFenton · Last active August 22, 2017
Feature Selection and Grid Searching Hyper-parameters
#!/usr/bin/env python
import argh
import sklearn as skl
import sklearn.ensemble
import sklearn.externals.joblib
import sklearn.feature_selection
import sklearn.linear_model
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.preprocessing
import sklearn.svm
from utils import get_logger

log = get_logger('gridsearch')


def main(matrix_file='data/full_matrix.pkl', labels_file='data/labels.pkl', jobs=-1, verbose=1, *algorithms):
    log.info("Loading {} and {}".format(matrix_file, labels_file))
    matrix = skl.externals.joblib.load(matrix_file)
    labels = skl.externals.joblib.load(labels_file)
    labels = labels.astype('bool')

    # normalize_matrix is defined in the companion snippet below
    matrix = normalize_matrix(matrix)

    # Just take 200 features to keep it small and fast
    log.info("Univariate feature selection")
    univar_select = sklearn.feature_selection.SelectKBest(k=200)
    matrix = univar_select.fit_transform(matrix, labels)

    # Not necessary unless you have a huge feature matrix ;)
    import gc
    gc.collect()
    gc.collect()

    if 'knn' in algorithms:
        clf = skl.neighbors.KNeighborsClassifier(n_jobs=jobs)
        tuned_parameters = [{
            'weights': ['distance', 'uniform'],
            'n_neighbors': [4, 16, 64],
            'metric': ['euclidean', 'minkowski'],
            'p': [2, 4],
            'leaf_size': [15, 30],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'knn_full' in algorithms:
        clf = skl.neighbors.KNeighborsClassifier()
        tuned_parameters = [{
            'weights': ['distance', 'uniform'],
            'n_neighbors': [1, 2, 3, 4, 5, 10, 20, 30, 50],
            'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
            'p': [2, 4],
            'leaf_size': [15, 30],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'sgd' in algorithms:
        clf = skl.linear_model.SGDClassifier()
        tuned_parameters = [{
            'loss': ['hinge', 'log', 'modified_huber', 'perceptron'],
            'penalty': ['none', 'l2', 'l1', 'elasticnet'],
            'alpha': [0.01, 0.001, 0.0001, 0.00001, 0.000001],
            'l1_ratio': [0.01, 0.15, 0.1, 0.2, 0.5, 0.9],
            # n_iter was renamed max_iter in scikit-learn >= 0.19
            'n_iter': [1, 5, 10, 20],
            'power_t': [0.1, 0.5, 0.9, 1.5, 2],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'ada_rf' in algorithms:
        rf = skl.ensemble.RandomForestClassifier(n_jobs=jobs)
        ada_rf = sklearn.ensemble.AdaBoostClassifier(rf, random_state=42)
        tuned_parameters = [{
            "base_estimator__n_estimators": [16, 32],
            "base_estimator__max_depth": [8, 16],
            "n_estimators": [50, 100],
        }]
        grid_search(ada_rf, tuned_parameters, matrix, labels, verbose)

    if 'rbf_svm' in algorithms:
        # Warning: No clue if this works.
        # SVC training time is quadratic with the number of samples,
        # so try a bagging classifier to break up the sample set.
        svc_base = skl.svm.SVC(
            verbose=verbose,
            random_state=42,
            class_weight='balanced'
        )
        n_estimators = 1000
        clf = skl.ensemble.BaggingClassifier(
            svc_base,
            max_samples=1.0 / n_estimators,
            n_estimators=n_estimators,
        )
        tuned_parameters = [{
            'base_estimator__kernel': ['rbf', 'sigmoid'],
            'base_estimator__probability': [True, False],
            'base_estimator__decision_function_shape': ['ovo', 'ovr'],
            # 'base_estimator__class_weight': [None, 'balanced'],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'rf' in algorithms:
        clf = sklearn.ensemble.RandomForestClassifier(
            verbose=verbose, random_state=42, n_jobs=jobs
        )
        tuned_parameters = [{
            'n_estimators': [50, 150, 300],
            'criterion': ["gini", "entropy"],
            'max_features': ["sqrt", "log2", 0.1, 0.3, None],
            'max_depth': [None, 10, 100],
            'min_samples_split': [2, 4, 8],
            'oob_score': [True, False],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'erf' in algorithms:
        clf = sklearn.ensemble.ExtraTreesClassifier(
            n_jobs=jobs, random_state=42, verbose=verbose,
            oob_score=True, bootstrap=True,
        )
        tuned_parameters = [{
            'n_estimators': [50, 150, 300],
            'criterion': ["gini", "entropy"],
            'max_features': ["sqrt", "log2", 0.1, 0.3, None],
            'max_depth': [None, 10, 100],
            'min_samples_split': [2, 4, 8],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)


def grid_search(classifier, tuned_parameters, matrix, labels, verbose):
    gs = skl.model_selection.GridSearchCV(classifier, tuned_parameters, cv=3, pre_dispatch=1, verbose=verbose)
    type_name = type(classifier).__name__
    print("Preparing to grid search {}".format(type_name))
    gs.fit(matrix, labels)
    print("{} CV results: {}".format(type_name, gs.cv_results_))
    print("{} best parameters: {}".format(type_name, gs.best_params_))


if __name__ == '__main__':
    argh.dispatch_command(main)
# Feature matrix normalization and simple feature selection
# Normalization isn't really useful for decision trees (e.g. RandomForest)
# but is important for PCA and VarianceThreshold.
def normalize_matrix(matrix, same_in_percent=0.9999):
    # Depending on your data, you may want to normalize. Experiment to find out.
    # log.info("Normalizing")
    # normalize = sklearn.preprocessing.Normalizer(copy=False)
    # matrix = normalize.fit_transform(matrix)
    log.info("Scaling")
    # If you have a dense matrix, you can use RobustScaler:
    # scale = skl.preprocessing.RobustScaler(copy=False)
    # If you have a sparse matrix, you need with_mean=False, but StandardScaler
    # is probably not good for a sparse matrix with a bunch of 0s:
    # https://stackoverflow.com/questions/30918781/right-function-for-normalizing-input-of-sklearn-svm
    # scale = skl.preprocessing.StandardScaler(copy=False, with_std=True, with_mean=False)
    # matrix = scale.fit_transform(matrix)
    # Using max-abs scaling because I have a sparse matrix with many 0s.
    # If you have a dense matrix, MinMaxScaler() may be a better option.
    maxabs = sklearn.preprocessing.MaxAbsScaler(copy=False)
    matrix = maxabs.fit_transform(matrix)

    # Remove features that are the same in same_in_percent of samples
    # (e.g. 0.85 would drop features identical in 85% of samples)
    log.info("Variance selection: {}".format(same_in_percent))
    variance = skl.feature_selection.VarianceThreshold(threshold=(same_in_percent * (1 - same_in_percent)))
    matrix = variance.fit_transform(matrix)
    log.info("New matrix shape: {}".format(matrix.shape))
    return matrix
# Recursive feature elimination with cross validation
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html#sklearn.feature_selection.RFECV
# Decision trees don't have coef_ for some strange reason, so expose
# feature_importances_ under that name.
class RandomForestClassifierWithCoef(sklearn.ensemble.RandomForestClassifier):
    def fit(self, *args, **kwargs):
        super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        return self


# Instead of this, could use some other well-performing model with coef_
clf = RandomForestClassifierWithCoef(n_jobs=16)
# My feature matrix has 1-2 million dimensions, so I can't use a tiny step size.
# Every recursion drops 0.1% of the worst features, and the minimum feature
# count is 0.1% of total features (if CV shows that's the best).
# This takes two days to run; if yours finishes in an hour, bump cv up to 3-10 and try step=1.
rfecv = sklearn.feature_selection.RFECV(
    estimator=clf,
    step=0.001, cv=2, scoring='roc_auc',
    verbose=2, n_jobs=16
)
X = rfecv.fit_transform(matrix, labels)
log.info("Selected matrix shape: {}".format(X.shape))
# Univariate function scoring
# Test different univariate feature scoring functions.
# http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
import sklearn.metrics  # needed for the scores logged below


def score_some_func(matrix, labels, score_func, target_ks=[100, 1000]):
    log.info("Score function = {}".format(score_func))
    univar_select = sklearn.feature_selection.SelectKBest(score_func)
    univar_select.fit(matrix, labels)
    for target_k in target_ks:
        log.info("Target k = {}".format(target_k))
        out_path = 'univar_feat_select_{}_target_k_{}'.format(score_func.__name__, target_k)
        univar_select.k = target_k
        X = univar_select.transform(matrix)

        univar_out_path = '{}_selector'.format(out_path)
        # Save this to load later, sort scores_, and plot to get an idea of how
        # many features are useful; there's often a long tail of useless features.
        sklearn.externals.joblib.dump(univar_select, univar_out_path)

        log.info('Performing cross validation')
        # Any classifier would work here, just want something fast
        clf = sklearn.ensemble.ExtraTreesClassifier(
            n_estimators=32, n_jobs=-1, random_state=42, verbose=True
        )
        predicted = sklearn.model_selection.cross_val_predict(clf, X, labels, verbose=True, cv=10)
        pred_out_path = '{}_predicted.pkl'.format(out_path)
        sklearn.externals.joblib.dump(predicted, pred_out_path)
        clf_out_path = '{}_clf.pkl'.format(out_path)
        sklearn.externals.joblib.dump(clf, clf_out_path)
        log.info(" recall: {}".format(sklearn.metrics.recall_score(labels, predicted)))
        log.info(" precision: {}".format(sklearn.metrics.precision_score(labels, predicted)))
        log.info(" accuracy: {}".format(sklearn.metrics.accuracy_score(labels, predicted)))


for score_func in (
    sklearn.feature_selection.mutual_info_classif,
    sklearn.feature_selection.f_classif,
    # Only use chi2 if all features are non-negative
    # sklearn.feature_selection.chi2,
):
    try:
        score_some_func(matrix, labels, score_func)
    except Exception as e:
        log.error("Failed scoring {}: {}".format(score_func.__name__, e))
CalebFenton commented Aug 4, 2017

The way this works is you normalize your matrix and run it through these to get some idea of how many features are important. Then you combine the selected features from a few of them with a feature stacker (http://scikit-learn.org/stable/auto_examples/feature_stacker.html), as sketched below. The feature selection helps drop useless features, improves training time and possibly prediction time, and helps the model generalize (by dropping odd-ball features which can lead to overfitting).
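
As a rough sketch of that stacking step (not from the gist: the selector names and k values are illustrative, and matrix/labels stand in for the normalized matrix and labels from the script above), FeatureUnion concatenates the outputs of several selectors:

import sklearn.feature_selection
import sklearn.pipeline

# Stack two univariate selectors; their outputs are concatenated column-wise
combined = sklearn.pipeline.FeatureUnion([
    ('f_classif', sklearn.feature_selection.SelectKBest(
        sklearn.feature_selection.f_classif, k=1000)),
    ('mutual_info', sklearn.feature_selection.SelectKBest(
        sklearn.feature_selection.mutual_info_classif, k=1000)),
])
stacked = combined.fit_transform(matrix, labels)
# Note: the selected sets can overlap, so some columns may be duplicated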

I'm experimenting with this now for my Android models. I'm going to combine the top 1-2k features from f_classif and mutual information with all the features from RFE. That should make the matrix small enough to experiment with KernelPCA and grid searching. Right now it takes two weeks to do a meaningful grid search on a 32-core machine with 64 GB of RAM.

If k=100 and k=1000 both perform really badly, maybe try k=2000 or 3000. If k=100 performs the same as k=1000, just use k=100, and so on.

Two things not here which might be useful are some kind of PCA to maximize variance and feature agglomeration. I don't use these two until I'm really fighting for gains, because you lose feature names and have no feedback for feature engineering. Both are kind of hard to use with high-rank feature matrices because you can run out of memory. Interestingly, I was able to use feature agglomeration to take ~30k features of our PE detection model (down from ~1 million features with f_classif feature selection and variance threshold) and cluster them into 100 features without much loss in model performance -- 2-3%. See the sketch below. Freaking crazy that 2 million super diverse files, about half malware, can be fairly accurately judged with only 100 features.
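
For reference, a minimal sketch of that agglomeration step (assuming a dense or densified post-selection matrix called dense_matrix, which is not a name from the gist; n_clusters=100 matches the experiment described above):

import sklearn.cluster

# Ward linkage works on dense arrays, which is where the memory pressure
# with high-dimensional matrices comes from
agglo = sklearn.cluster.FeatureAgglomeration(n_clusters=100)
reduced = agglo.fit_transform(dense_matrix)  # clusters columns, then averages each cluster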
