Gist by @CalebFenton · Last active August 22, 2017
Feature Selection and Grid Searching Hyper-parameters
#!/usr/bin/env python
import argh
import sklearn as skl
import sklearn.ensemble
import sklearn.externals.joblib
import sklearn.feature_selection
import sklearn.linear_model
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.preprocessing
import sklearn.svm
from utils import get_logger

log = get_logger('gridsearch')


def main(matrix_file='data/full_matrix.pkl', labels_file='data/labels.pkl', jobs=-1, verbose=1, *algorithms):
    log.info("Loading {} and {}".format(matrix_file, labels_file))
    matrix = skl.externals.joblib.load(matrix_file)
    labels = skl.externals.joblib.load(labels_file)
    labels = labels.astype('bool')

    # normalize_matrix is defined in the companion snippet below
    matrix = normalize_matrix(matrix)

    # Just take 200 features to keep it small and fast
    log.info("Univariate feature selection")
    univar_select = sklearn.feature_selection.SelectKBest(k=200)
    matrix = univar_select.fit_transform(matrix, labels)

    # Not necessary unless you have a huge feature matrix ;)
    import gc
    gc.collect()
    gc.collect()

    if 'knn' in algorithms:
        clf = skl.neighbors.KNeighborsClassifier(n_jobs=jobs)
        tuned_parameters = [{
            'weights': ['distance', 'uniform'],
            'n_neighbors': [4, 16, 64],
            'metric': ['euclidean', 'minkowski'],
            'p': [2, 4],
            'leaf_size': [15, 30],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'knn_full' in algorithms:
        clf = skl.neighbors.KNeighborsClassifier()
        tuned_parameters = [{
            'weights': ['distance', 'uniform'],
            'n_neighbors': [1, 2, 3, 4, 5, 10, 20, 30, 50],
            'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
            'p': [2, 4],
            'leaf_size': [15, 30],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'sgd' in algorithms:
        clf = skl.linear_model.SGDClassifier()
        tuned_parameters = [{
            'loss': ['hinge', 'log', 'modified_huber', 'perceptron'],
            'penalty': ['none', 'l2', 'l1', 'elasticnet'],
            'alpha': [0.01, 0.001, 0.0001, 0.00001, 0.000001],
            'l1_ratio': [0.01, 0.15, 0.1, 0.2, 0.5, 0.9],
            # n_iter was renamed max_iter in scikit-learn >= 0.19
            'n_iter': [1, 5, 10, 20],
            'power_t': [0.1, 0.5, 0.9, 1.5, 2],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'ada_rf' in algorithms:
        rf = skl.ensemble.RandomForestClassifier(n_jobs=jobs)
        ada_rf = sklearn.ensemble.AdaBoostClassifier(rf, random_state=42)
        tuned_parameters = [{
            "base_estimator__n_estimators": [16, 32],
            "base_estimator__max_depth": [8, 16],
            "n_estimators": [50, 100],
        }]
        grid_search(ada_rf, tuned_parameters, matrix, labels, verbose)

    if 'rbf_svm' in algorithms:
        # Warning: No clue if this works.
        # SVC training time is quadratic with the number of samples,
        # so try a bagging classifier to break up the sample set.
        svc_base = skl.svm.SVC(
            verbose=verbose,
            random_state=42,
            class_weight='balanced'
        )
        n_estimators = 1000
        clf = skl.ensemble.BaggingClassifier(
            svc_base,
            max_samples=1.0 / n_estimators,
            n_estimators=n_estimators,
        )
        tuned_parameters = [{
            'base_estimator__kernel': ['rbf', 'sigmoid'],
            'base_estimator__probability': [True, False],
            'base_estimator__decision_function_shape': ['ovo', 'ovr'],
            # 'base_estimator__class_weight': [None, 'balanced'],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'rf' in algorithms:
        clf = sklearn.ensemble.RandomForestClassifier(
            verbose=verbose, random_state=42, n_jobs=jobs
        )
        tuned_parameters = [{
            'n_estimators': [50, 150, 300],
            'criterion': ["gini", "entropy"],
            'max_features': ["sqrt", "log2", 0.1, 0.3, None],
            'max_depth': [None, 10, 100],
            'min_samples_split': [2, 4, 8],
            'oob_score': [True, False],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)

    if 'erf' in algorithms:
        clf = sklearn.ensemble.ExtraTreesClassifier(
            n_jobs=jobs, random_state=42, verbose=verbose,
            oob_score=True, bootstrap=True,
        )
        tuned_parameters = [{
            'n_estimators': [50, 150, 300],
            'criterion': ["gini", "entropy"],
            'max_features': ["sqrt", "log2", 0.1, 0.3, None],
            'max_depth': [None, 10, 100],
            'min_samples_split': [2, 4, 8],
        }]
        grid_search(clf, tuned_parameters, matrix, labels, verbose)


def grid_search(classifier, tuned_parameters, matrix, labels, verbose):
    gs = skl.model_selection.GridSearchCV(classifier, tuned_parameters, cv=3, pre_dispatch=1, verbose=verbose)
    type_name = type(classifier).__name__
    print("Preparing to grid search {}".format(type_name))
    gs.fit(matrix, labels)
    print("{} CV results: {}".format(type_name, gs.cv_results_))
    print("{} best parameters: {}".format(type_name, gs.best_params_))


if __name__ == '__main__':
    argh.dispatch_command(main)
# Feature matrix normalization and simple feature selection
# Normalization isn't really useful for decision trees (e.g. RandomForest)
# but is important for PCA and VarianceThreshold.
def normalize_matrix(matrix, same_in_percent=0.9999):
    # Depending on your data, you may want to normalize. Experiment to find out.
    # log.info("Normalizing")
    # normalize = sklearn.preprocessing.Normalizer(copy=False)
    # matrix = normalize.fit_transform(matrix)
    log.info("Scaling")
    # If you have a dense matrix, you can use RobustScaler:
    # scale = skl.preprocessing.RobustScaler(copy=False)
    # If you have a sparse matrix, you need with_mean=False, but StandardScaler
    # is probably not good for a sparse matrix with a bunch of 0s:
    # https://stackoverflow.com/questions/30918781/right-function-for-normalizing-input-of-sklearn-svm
    # scale = skl.preprocessing.StandardScaler(copy=False, with_std=True, with_mean=False)
    # matrix = scale.fit_transform(matrix)
    # Using max-abs scaling because I have a sparse matrix with many 0s.
    # If you have a dense matrix, MinMaxScaler() may be a better option.
    maxabs = sklearn.preprocessing.MaxAbsScaler(copy=False)
    matrix = maxabs.fit_transform(matrix)

    # Remove features that are the same in same_in_percent of samples
    # (e.g. 0.85 would drop features identical in 85% of samples)
    log.info("Variance selection: {}".format(same_in_percent))
    variance = skl.feature_selection.VarianceThreshold(threshold=(same_in_percent * (1 - same_in_percent)))
    matrix = variance.fit_transform(matrix)
    log.info("New matrix shape: {}".format(matrix.shape))
    return matrix
# Recursive feature elimination with cross validation
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html#sklearn.feature_selection.RFECV
# Decision trees don't have coef_ for some strange reason, so expose
# feature_importances_ under that name.
class RandomForestClassifierWithCoef(sklearn.ensemble.RandomForestClassifier):
    def fit(self, *args, **kwargs):
        super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        return self


# Instead of this, could use some other well-performing model with coef_
clf = RandomForestClassifierWithCoef(n_jobs=16)
# My feature matrix has 1-2 million dimensions, so I can't use a tiny step size.
# Every recursion drops 0.1% of the worst features, and the minimum feature
# count is 0.1% of total features (if CV shows that's the best).
# This takes two days to run; if yours finishes in an hour, bump cv up to 3-10 and try step=1.
rfecv = sklearn.feature_selection.RFECV(
    estimator=clf,
    step=0.001, cv=2, scoring='roc_auc',
    verbose=2, n_jobs=16
)
X = rfecv.fit_transform(matrix, labels)
log.info("Selected matrix shape: {}".format(X.shape))
# Univariate function scoring
# Test different univariate feature scoring functions.
# http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
import sklearn.metrics  # needed for the scores logged below


def score_some_func(matrix, labels, score_func, target_ks=[100, 1000]):
    log.info("Score function = {}".format(score_func))
    univar_select = sklearn.feature_selection.SelectKBest(score_func)
    univar_select.fit(matrix, labels)
    for target_k in target_ks:
        log.info("Target k = {}".format(target_k))
        out_path = 'univar_feat_select_{}_target_k_{}'.format(score_func.__name__, target_k)
        univar_select.k = target_k
        X = univar_select.transform(matrix)

        univar_out_path = '{}_selector'.format(out_path)
        # Save this to load later, sort scores_, and plot to get an idea of how
        # many features are useful; there's often a long tail of useless features.
        sklearn.externals.joblib.dump(univar_select, univar_out_path)

        log.info('Performing cross validation')
        # Any classifier would work here, just want something fast
        clf = sklearn.ensemble.ExtraTreesClassifier(
            n_estimators=32, n_jobs=-1, random_state=42, verbose=True
        )
        predicted = sklearn.model_selection.cross_val_predict(clf, X, labels, verbose=True, cv=10)
        pred_out_path = '{}_predicted.pkl'.format(out_path)
        sklearn.externals.joblib.dump(predicted, pred_out_path)
        clf_out_path = '{}_clf.pkl'.format(out_path)
        sklearn.externals.joblib.dump(clf, clf_out_path)
        log.info(" recall: {}".format(sklearn.metrics.recall_score(labels, predicted)))
        log.info(" precision: {}".format(sklearn.metrics.precision_score(labels, predicted)))
        log.info(" accuracy: {}".format(sklearn.metrics.accuracy_score(labels, predicted)))


for score_func in (
    sklearn.feature_selection.mutual_info_classif,
    sklearn.feature_selection.f_classif,
    # Only use chi2 if all features are non-negative
    # sklearn.feature_selection.chi2,
):
    try:
        score_some_func(matrix, labels, score_func)
    except Exception as e:
        log.error("Failed scoring {}: {}".format(score_func.__name__, e))
CalebFenton commented Aug 4, 2017

The way this works is you normalize your matrix and run it through these to get some idea of how many features are important. Then you combine the selected features from a few of them with a feature stacker (http://scikit-learn.org/stable/auto_examples/feature_stacker.html), as sketched below. The feature selection helps drop useless features, improves training time and possibly prediction time, and helps the model generalize (by dropping odd-ball features which can lead to overfitting).
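
As a rough sketch of that stacking step (not from the gist: the selector names and k values are illustrative, and matrix/labels stand in for the normalized matrix and labels from the script above), FeatureUnion concatenates the outputs of several selectors:

import sklearn.feature_selection
import sklearn.pipeline

# Stack two univariate selectors; their outputs are concatenated column-wise
combined = sklearn.pipeline.FeatureUnion([
    ('f_classif', sklearn.feature_selection.SelectKBest(
        sklearn.feature_selection.f_classif, k=1000)),
    ('mutual_info', sklearn.feature_selection.SelectKBest(
        sklearn.feature_selection.mutual_info_classif, k=1000)),
])
stacked = combined.fit_transform(matrix, labels)
# Note: the selected sets can overlap, so some columns may be duplicated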

I'm experimenting with this now for my Android models. I'm going to combine the top 1-2k features from f_classif and mutual information with all the features from RFE. That should make the matrix small enough to experiment with KernelPCA and grid searching. Right now it takes two weeks to do a meaningful grid search on a 32-core machine with 64 GB of RAM.

If k=100 and k=1000 both perform really badly, maybe try k=2000 or 3000. If k=100 performs the same as k=1000, just use k=100, and so on.

Two things not here which might be useful are some kind of PCA to maximize variance and feature agglomeration. I don't use these two until I'm really fighting for gains, because you lose feature names and have no feedback for feature engineering. Both are kind of hard to use with high-rank feature matrices because you can run out of memory. Interestingly, I was able to use feature agglomeration to take ~30k features of our PE detection model (down from ~1 million features with f_classif feature selection and variance threshold) and cluster them into 100 features without much loss in model performance -- 2-3%. See the sketch below. Freaking crazy that 2 million super diverse files, about half malware, can be fairly accurately judged with only 100 features.
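
For reference, a minimal sketch of that agglomeration step (assuming a dense or densified post-selection matrix called dense_matrix, which is not a name from the gist; n_clusters=100 matches the experiment described above):

import sklearn.cluster

# Ward linkage works on dense arrays, which is where the memory pressure
# with high-dimensional matrices comes from
agglo = sklearn.cluster.FeatureAgglomeration(n_clusters=100)
reduced = agglo.fit_transform(dense_matrix)  # clusters columns, then averages each cluster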
