# Gist by @alik604, created November 24, 2019
# Imports for the full pipeline (the original gist assumes these are already in scope)
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.feature_selection import RFE
from sklearn.decomposition import TruncatedSVD

# load data and set column labels; details omitted (`labels` is the list of NSL-KDD column names, defined elsewhere)
train = pd.read_csv('https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.csv')
test = pd.read_csv('https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest%2B.csv')
train.columns, test.columns = labels, labels
# combine the splits so the label encoding below is consistent across both
combined_data = pd.concat([train, test]).drop('difficulty_level', axis=1)
le = LabelEncoder()
vector = combined_data['attack_type']
print("Attack Vectors:", set(vector))  # print() keeps the whole set on one line
combined_data['attack_type'] = le.fit_transform(vector)
combined_data['protocol_type'] = le.fit_transform(combined_data['protocol_type'])
combined_data['service'] = le.fit_transform(combined_data['service'])
combined_data['flag'] = le.fit_transform(combined_data['flag'])
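# Side note (addition, not part of the original gist): LabelEncoder imposes an
# arbitrary ordering on categories. The tree ensembles below mostly tolerate
# that; for linear or distance-based models, one-hot encoding is safer.
# Minimal sketch on the already-encoded columns; unused by the pipeline below:
combined_data_onehot = pd.get_dummies(combined_data, columns=['protocol_type', 'service', 'flag'])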
# train_test_split and normalize
data_x = combined_data.drop('attack_type', axis=1)
## normalize(data_x) should really happen here, before the split
data_y = combined_data.loc[:, ['attack_type']]
# del combined_data  # free memory
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.5, random_state=42)  # TODO
X_train = pd.DataFrame(normalize(X_train))
X_test = pd.DataFrame(normalize(X_test))
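# Side note (sketch, not the gist's method): sklearn's normalize() scales each
# *row* to unit norm, which is unusual for tabular features. Per-feature
# scaling fit on the training split only is the more common pattern; the
# `_std` variables here are illustrative and unused below.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = pd.DataFrame(scaler.fit_transform(X_train))
X_test_std = pd.DataFrame(scaler.transform(X_test))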
DTC = DecisionTreeClassifier()
RFC = RandomForestClassifier(n_estimators=25, random_state=1)
ETC = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_features='sqrt', bootstrap=False)  # 'auto' (as in the original) was removed in newer sklearn; 'sqrt' is equivalent for classifiers
x = X_train
y = y_train['attack_type'].values.ravel()
# benchmark without feature selection
eclf = VotingClassifier(estimators=[('dt', DTC), ('rf', RFC), ('et', ETC)], voting='hard')
for clf, label in zip([DTC, RFC, ETC, eclf], ['DecisionTreeClassifier', 'RandomForestClassifier', 'ExtraTreesClassifier', 'Ensemble']):
    _ = clf.fit(x, y)
    acc = clf.score(X_test, y_test)
    print("Acc: %0.10f [%s]" % (acc, label))
'''
wow! 99% already....
41 dimensions
Acc: 0.9907079467 [DecisionTreeClassifier]
Acc: 0.9934955627 [RandomForestClassifier]
Acc: 0.9922431555 [ExtraTreesClassifier]
Acc: 0.9935628964 [Ensemble]
'''
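# Optional sanity check (addition, not in the original gist): 5-fold
# cross-validation on the training split, to confirm the hold-out score is
# not an artifact of the single 50/50 split.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=25, random_state=1), x, y, cv=5)
print("RandomForest 5-fold CV: %0.4f (+/- %0.4f)" % (cv_scores.mean(), cv_scores.std()))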
# recursive feature elimination, then singular value decomposition
n = 30  # features RFE keeps; undefined in the original, inferred from the "10 dimensions" result since the SVD below uses n - 20 components
rfe = RFE(DTC, n_features_to_select=n).fit(x, y)
desiredIndices = np.where(rfe.support_)[0]
whitelist = X_train.columns.values[desiredIndices]
svd = TruncatedSVD(n_components=n - 20)
_ = svd.fit(X_train[whitelist])  # or fit_transform and omit part of the next line
X_train_svd, X_test_svd = svd.transform(X_train[whitelist]), svd.transform(X_test[whitelist])
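# Equivalent framing (sketch, assuming the `n` defined above): chain both
# reduction steps in a Pipeline so the column bookkeeping is automatic.
# The `_alt` variables are illustrative and unused below.
from sklearn.pipeline import Pipeline
reduce_pipe = Pipeline([
    ('rfe', RFE(DecisionTreeClassifier(), n_features_to_select=n)),
    ('svd', TruncatedSVD(n_components=n - 20)),
])
X_train_svd_alt = reduce_pipe.fit_transform(X_train, y)
X_test_svd_alt = reduce_pipe.transform(X_test)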
# benchmark after feature reduction
DTC = DecisionTreeClassifier()
RFC = RandomForestClassifier(n_estimators=25, random_state=1)
ETC = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_features='sqrt', bootstrap=False)
eclf = VotingClassifier(estimators=[('dt', DTC), ('rf', RFC), ('et', ETC)], voting='hard')
for clf, label in zip([DTC, RFC, ETC, eclf], ['DecisionTreeClassifier', 'RandomForestClassifier', 'ExtraTreesClassifier', 'Ensemble']):
    _ = clf.fit(X_train_svd, y)  # the original fit and scored eclf on every pass; fit each clf instead
    acc = clf.score(X_test_svd, y_test)
    print("Acc: %0.10f [%s]" % (acc, label))
'''
10 dimensions
Acc: 0.9857387182 [DecisionTreeClassifier]
Acc: 0.9863043215 [RandomForestClassifier]
Acc: 0.9860753868 [ExtraTreesClassifier]
Acc: 0.9861157871 [Ensemble]  # 0.74 percentage points lower, with ~25% of the dimensions
~75% fewer dimensions costs <1% accuracy
'''
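# Quick check (addition, not in the original gist): how much variance the 10
# SVD components retain from the RFE-selected features.
print("SVD explained variance retained: %0.4f" % svd.explained_variance_ratio_.sum())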