import pickle

import joblib
import numpy as np
import pandas as pd
import sklearn.ensemble as ske
from sklearn import tree
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Load the dataset; drop the identifier columns ('Name', 'md5') and the label
# to build the feature matrix, keeping 'legitimate' as the target.
data = pd.read_csv('data.csv', sep='|')
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = data['legitimate'].values

print('Researching important features based on %i total features\n' % X.shape[1])
# Feature selection using a tree-based classifier: keep only the features
# whose importance clears the threshold chosen by SelectFromModel.
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)
features = []

print('%i features identified as important:' % nb_features)
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, data.columns[2 + indices[f]],
                                   fsel.feature_importances_[indices[f]]))

# Record the selected feature names, sorted back into original column order
# ('Name' and 'md5' occupy the first two columns, hence the 2 + f offset).
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2 + f])
# Algorithm comparison: train several classifiers on the selected features
# and keep the one with the best held-out accuracy.
algorithms = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
    "RandomForest": ske.RandomForestClassifier(n_estimators=50),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}
results = {}
print("\nNow testing algorithms")
for algo, clf in algorithms.items():
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score * 100))
    results[algo] = score
winner = max(results, key=results.get)
print('\nWinning algorithm is %s with %f %% accuracy' % (winner, results[winner] * 100))
# Save the winning model and the feature list for later predictions.
print('Saving algorithm and feature list in classifier directory...')
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
with open('classifier/features.pkl', 'wb') as f:
    pickle.dump(features, f)
print('Saved')
# Identify false positive and false negative rates from the confusion matrix
# (row 0 corresponds to label 0, row 1 to label 1).
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print('False positive rate : %f %%' % ((mt[0][1] / float(sum(mt[0]))) * 100))
print('False negative rate : %f %%' % ((mt[1][0] / float(sum(mt[1]))) * 100))
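
# A minimal usage sketch (an addition, not part of the original gist): reload
# the saved model and feature list and confirm the round-trip works. A new
# sample would need its features extracted in the same order as `features`.
loaded_clf = joblib.load('classifier/classifier.pkl')
with open('classifier/features.pkl', 'rb') as f:
    loaded_features = pickle.load(f)
print('Reloaded %s with %i features; held-out accuracy: %f %%'
      % (winner, len(loaded_features), loaded_clf.score(X_test, y_test) * 100))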