-
-
Save CalebFenton/66aa04af7b4a4d98efca059cb8c2e7aa to your computer and use it in GitHub Desktop.
Feature Selection and Grid Searching Hyper-parameters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argh
import sklearn as skl
import sklearn.ensemble
import sklearn.feature_selection
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.preprocessing
import sklearn.svm

from utils import get_logger
# Module-wide logger shared by every function in this script.
log = get_logger("gridsearch")
def main(matrix_file='data/full_matrix.pkl', labels_file='data/labels.pkl',
         jobs=-1, verbose=1, *algorithms):
    """Grid search hyper-parameters for each requested classifier.

    The feature matrix and labels are loaded with joblib, the labels are cast
    to bool, the matrix is passed through ``normalize_matrix`` and then cut
    down to the 200 best features with ``SelectKBest`` before any search runs.

    Args:
        matrix_file: Path of the joblib-pickled feature matrix.
        labels_file: Path of the joblib-pickled label vector.
        jobs: ``n_jobs`` value handed to the estimators that accept it.
        verbose: Verbosity handed to the estimators and to ``grid_search``.
        *algorithms: Names of the searches to run. Recognised names are
            ``knn``, ``knn_full``, ``sgd``, ``ada_rf``, ``rbf_svm``, ``rf``
            and ``erf``. Searches always run in that order, whatever order
            the names are given in.
    """
    known_algorithms = ('knn', 'knn_full', 'sgd', 'ada_rf', 'rbf_svm', 'rf', 'erf')

    # Report typos instead of silently skipping them.
    unknown = [name for name in algorithms if name not in known_algorithms]
    if unknown:
        log.error("Ignoring unknown algorithms: {}".format(unknown))

    # Bail out before the expensive load/scale/select steps when there is
    # nothing to search.
    if not any(name in known_algorithms for name in algorithms):
        log.error("No known algorithm requested; choose from: {}".format(
            list(known_algorithms)))
        return

    log.info("Loading {} and {}".format(matrix_file, labels_file))
    # NOTE(review): joblib unpickles arbitrary objects; only load trusted files.
    matrix = skl.externals.joblib.load(matrix_file)
    labels = skl.externals.joblib.load(labels_file)
    labels = labels.astype('bool')

    matrix = normalize_matrix(matrix)

    # Just take 200 features to keep it small and fast
    log.info("Univariate feature selection")
    univar_select = sklearn.feature_selection.SelectKBest(k=200)
    matrix = univar_select.fit_transform(matrix, labels)

    # Not necessary unless you have a huge feature matrix ;)
    import gc
    gc.collect()
    gc.collect()

    if 'knn' in algorithms:
        clf = skl.neighbors.KNeighborsClassifier(n_jobs=jobs)
        tuned_parameters = [{
            'weights': ['distance', 'uniform'],
            'n_neighbors': [4, 16, 64],
            'metric': ['euclidean', 'minkowski'],
            'p': [2, 4],
            'leaf_size': [15, 30],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'knn_full' in algorithms:
        # Larger KNN grid; unlike 'knn' the estimator is built without n_jobs.
        clf = skl.neighbors.KNeighborsClassifier()
        tuned_parameters = [{
            'weights': ['distance', 'uniform'],
            'n_neighbors': [1, 2, 3, 4, 5, 10, 20, 30, 50],
            'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
            'p': [2, 4],
            'leaf_size': [15, 30],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'sgd' in algorithms:
        clf = skl.linear_model.SGDClassifier()
        # NOTE(review): these parameter names/values ('n_iter', 'log', 'none')
        # presumably target an older scikit-learn release -- confirm against
        # the installed version.
        tuned_parameters = [{
            'loss': ['hinge', 'log', 'modified_huber', 'perceptron'],
            'penalty': ['none', 'l2', 'l1', 'elasticnet'],
            'alpha': [0.01, 0.001, 0.0001, 0.00001, 0.000001],
            'l1_ratio': [0.01, 0.15, 0.1, 0.2, 0.5, 0.9],
            'n_iter': [1, 5, 10, 20],
            'power_t': [0.1, 0.5, 0.9, 1.5, 2],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'ada_rf' in algorithms:
        # AdaBoost over a random forest; "base_estimator__*" keys tune the
        # wrapped forest, plain keys tune AdaBoost itself.
        rf = skl.ensemble.RandomForestClassifier(n_jobs=jobs)
        ada_rf = sklearn.ensemble.AdaBoostClassifier(rf, random_state=42)
        tuned_parameters = [{
            "base_estimator__n_estimators": [16, 32],
            "base_estimator__max_depth": [8, 16],
            "n_estimators": [50, 100]
        }]
        grid_search(ada_rf, tuned_parameters, matrix, labels, verbose)

    if 'rbf_svm' in algorithms:
        # Warning: No clue if this works.
        # SVC training time is quadratic with number of samples.
        # Trying to use bagging classifier to break up sample sets.
        svc_base = skl.svm.SVC(
            verbose=verbose,
            random_state=42,
            class_weight='balanced'
        )
        n_estimators = 1000
        # Each of the n_estimators bagged SVCs sees 1/n_estimators of the
        # samples.
        clf = skl.ensemble.BaggingClassifier(
            svc_base,
            max_samples=1.0 / n_estimators,
            n_estimators=n_estimators,
        )
        tuned_parameters = [{
            'base_estimator__kernel': ['rbf', 'sigmoid'],
            'base_estimator__probability': [True, False],
            'base_estimator__decision_function_shape': ['ovo', 'ovr'],
            # 'base_estimator__class_weight': [None, 'balanced'],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'rf' in algorithms:
        clf = sklearn.ensemble.RandomForestClassifier(
            verbose=verbose, random_state=42, n_jobs=jobs
        )
        tuned_parameters = [{
            'n_estimators': [50, 150, 300],
            'criterion': ["gini", "entropy"],
            'max_features': ["sqrt", "log2", 0.1, 0.3, None],
            'max_depth': [None, 10, 100],
            'min_samples_split': [2, 4, 8],
            'oob_score': [True, False],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'erf' in algorithms:
        # oob_score and bootstrap are fixed on the estimator rather than
        # searched over.
        clf = sklearn.ensemble.ExtraTreesClassifier(
            n_jobs=jobs, random_state=42, verbose=verbose,
            oob_score=True, bootstrap=True,
        )
        tuned_parameters = [{
            'n_estimators': [50, 150, 300],
            'criterion': ["gini", "entropy"],
            'max_features': ["sqrt", "log2", 0.1, 0.3, None],
            'max_depth': [None, 10, 100],
            'min_samples_split': [2, 4, 8],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)
def grid_search(classifier, tuned_parameters, matrix, labels, verbose):
    """Run a 3-fold ``GridSearchCV`` for ``classifier`` and print the outcome.

    Args:
        classifier: Unfitted estimator to tune.
        tuned_parameters: Parameter grid (list of dicts) given to GridSearchCV.
        matrix: Feature matrix to fit on.
        labels: Target labels matching the rows of ``matrix``.
        verbose: Verbosity level forwarded to GridSearchCV.

    Returns:
        The fitted ``GridSearchCV`` instance, so callers can inspect
        ``best_estimator_`` and friends instead of parsing the printed output.
    """
    gs = skl.model_selection.GridSearchCV(
        classifier, tuned_parameters, cv=3, pre_dispatch=1, verbose=verbose
    )
    type_name = type(classifier).__name__
    print("Preparing to grid search {}".format(type_name))
    gs.fit(matrix, labels)
    print("{} CV results: {}".format(type_name, gs.cv_results_))
    print("{} best parameters: {}".format(type_name, gs.best_params_))
    return gs
# Script entry point: argh turns main()'s signature into the command line.
if __name__ == "__main__":
    argh.dispatch_command(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Feature matrix normalization and simple feature selection | |
# Normalization isn't really useful for decision trees (e.g. RandomForest) but important for PCA and VarianceThreshold | |
def normalize_matrix(matrix, same_in_percent=0.9999):
    """Scale ``matrix`` by per-feature max-abs and drop near-constant features.

    Args:
        matrix: Feature matrix. It is scaled with ``copy=False``, so the
            caller's object may be modified in place.
        same_in_percent: Fraction ``p`` used to derive the variance cut-off
            ``p * (1 - p)``; features whose variance does not exceed it are
            removed.

    Returns:
        The scaled matrix restricted to the features that passed the
        variance threshold.
    """
    # Whether to normalize at all depends on the data, so experiment.
    # Alternatives that were tried and left out:
    #   * sklearn.preprocessing.Normalizer(copy=False)
    #   * skl.preprocessing.RobustScaler(copy=False) -- for a dense matrix
    #   * skl.preprocessing.StandardScaler(copy=False, with_std=True,
    #     with_mean=False) -- with_mean=False is required for sparse input,
    #     but StandardScaler is probably a poor fit for a sparse matrix with
    #     a bunch of 0s, see
    #     https://stackoverflow.com/questions/30918781/right-function-for-normalizing-input-of-sklearn-svm
    log.info("Scaling")

    # Max-abs scaling is used because the matrix is sparse with many 0s.
    # For a dense matrix, MinMaxScaler() may be a better option.
    scaler = sklearn.preprocessing.MaxAbsScaler(copy=False)
    matrix = scaler.fit_transform(matrix)

    # p * (1 - p) is the variance of a 0/1 feature that takes one value in a
    # fraction p of the samples, so this drops features that are (nearly) the
    # same in `same_in_percent` of samples.
    min_variance = same_in_percent * (1 - same_in_percent)
    log.info("Variance selection: {}".format(same_in_percent))
    selector = skl.feature_selection.VarianceThreshold(threshold=min_variance)
    matrix = selector.fit_transform(matrix)

    log.info("New matrix shape: {}".format(matrix.shape))
    return matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Recursive feature elimination with cross validation | |
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html#sklearn.feature_selection.RFECV | |
# Decision trees don't have coef_ for some strange reason. | |
class RandomForestClassifierWithCoef(sklearn.ensemble.RandomForestClassifier):
    """Random forest that also exposes ``feature_importances_`` as ``coef_``.

    This lets the forest be used by feature selectors that rank features
    through a ``coef_`` attribute.
    """

    def fit(self, *args, **kwargs):
        """Fit the forest, mirror the importances into ``coef_``, return self.

        All arguments are forwarded unchanged to
        ``RandomForestClassifier.fit``.
        """
        super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        # Estimators must return self from fit() so calls can be chained.
        return self
# Any other well-performing model that exposes coef_ could be used instead.
clf = RandomForestClassifierWithCoef(n_jobs=16)

# The feature matrix here has 1-2 million dimensions, so a tiny step size is
# not an option:
#   * step=0.001 drops the worst 0.1% of features on every recursion;
#   * the minimum feature count is 0.1% of all features (if CV shows that
#     is the best).
# This takes two days to run. If yours finishes in an hour, bump cv up to
# 3-10 and try step=1.
rfecv = sklearn.feature_selection.RFECV(
    estimator=clf,
    step=0.001,
    cv=2,
    scoring='roc_auc',
    verbose=2,
    n_jobs=16,
)

X = rfecv.fit_transform(matrix, labels)
log.info("Selected matrix shape: {}".format(X.shape))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Univariate function scoring
# Test different univariate feature scoring functions
# http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
def score_some_func(matrix, labels, score_func, target_ks=(100, 1000)):
    """Score features with ``score_func`` and cross-validate the top-k subsets.

    The selector is fitted once. For every ``k`` in ``target_ks`` the top-k
    features are extracted, a small ExtraTrees model is evaluated with
    10-fold cross-validated predictions, and the selector, predictions and
    classifier are dumped with joblib into the current working directory.

    Args:
        matrix: Feature matrix.
        labels: Target labels matching the rows of ``matrix``.
        score_func: Univariate scoring function accepted by ``SelectKBest``.
        target_ks: Iterable of feature counts to evaluate.
    """
    log.info("Score function = {}".format(score_func))
    univar_select = sklearn.feature_selection.SelectKBest(score_func)
    # Scoring can be expensive, so fit once and only change k afterwards.
    univar_select.fit(matrix, labels)

    for target_k in target_ks:
        log.info("Target k = {}".format(target_k))
        out_path = 'univar_feat_select_{}_target_k_{}'.format(score_func.__name__, target_k)

        univar_select.k = target_k
        X = univar_select.transform(matrix)

        # NOTE(review): the file name is kept exactly as before (including
        # its spelling and missing extension) so existing loaders still work.
        univar_out_path = '{}_selctor'.format(out_path)
        # Save this to load later, sort scores_, and plot to get idea of how
        # many features are useful.
        # There's often a long tail of useless features
        sklearn.externals.joblib.dump(univar_select, univar_out_path)

        log.info('Performing cross validation')
        # Any classifier would work here, just want something fast
        clf = sklearn.ensemble.ExtraTreesClassifier(
            n_estimators=32, n_jobs=-1, random_state=42, verbose=True
        )
        predicted = sklearn.model_selection.cross_val_predict(clf, X, labels, verbose=True, cv=10)

        pred_out_path = '{}_predicted.pkl'.format(out_path)
        sklearn.externals.joblib.dump(predicted, pred_out_path)

        # cross_val_predict only fits clones, which would leave `clf` itself
        # unfitted; fit it on the selected features so the pickled model is
        # usable.
        clf.fit(X, labels)
        clf_out_path = '{}_clf.pkl'.format(out_path)
        sklearn.externals.joblib.dump(clf, clf_out_path)

        log.info(" recall: {}".format(sklearn.metrics.recall_score(labels, predicted)))
        log.info(" precision: {}".format(sklearn.metrics.precision_score(labels, predicted)))
        log.info(" accuracy: {}".format(sklearn.metrics.accuracy_score(labels, predicted)))


# The loop runs only after score_some_func has been defined; a failure in one
# scoring function is logged and does not stop the others.
for score_func in (
    sklearn.feature_selection.mutual_info_classif,
    sklearn.feature_selection.f_classif,
    # only use chi2 if features are positive
    # sklearn.feature_selection.chi2,
):
    try:
        score_some_func(matrix, labels, score_func)
    except Exception as e:
        log.error("Failed scoring {}: {}".format(score_func.__name__, e))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The way this works is that you normalize your matrix and run it through these to get some idea of how many features are important. Then you combine the selected features from a few of them with http://scikit-learn.org/stable/auto_examples/feature_stacker.html. Feature selection helps drop useless features, improves training time and possibly prediction time, and helps the model generalize (by dropping odd-ball features which can lead to overfitting).
I'm experimenting with this now for my Android models. Going to combine top 1-2k features from f_classif and mutual information and all features from RFE. That should make the matrix small enough to experiment with KernelPCA and grid searching. Right now it takes two weeks to do a meaningful grid search on a 32 core, 64 gig ram machine.
If k=100 or k=1000 perform really badly, maybe try k=2000 or 3000. If k=100 performs the same as k=1000, just use k=100, etc.
Two things not here which might be useful are some kind of PCA to maximize variance, and feature agglomeration. I don't use these two until I'm really fighting for gains because you lose feature names and have no feedback for feature engineering. Both are kind of hard to use with high-rank feature matrices because you can run out of memory. Interestingly, I was able to use feature agglomeration to take ~30k features of our PE detection model (down from ~1 million features with f_classif feature selection and variance threshold) and cluster them into 100 features without much loss in model performance -- 2-3%. Freaking crazy that 2 million super diverse files, about half malware, can be fairly accurately judged with only 100 features.