Last active
March 28, 2020 19:35
-
-
Save parvathysarat/fc537f26d3d70639fc80a9a43b7c0263 to your computer and use it in GitHub Desktop.
This Random Forest model was developed to detect fraud in the login access of a Firewall server. A two-way classification.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob,struct,os | |
import pandas as pd | |
import numpy as np | |
#names of the columns | |
names=["Timestamp","Customer ID","Host","Log file","Log sequence no.","Entry type","Entry identifier","User,if","Reporting IP/host","Source IP,if","Source port,if","Destination IP, if","Destination Port, if","Text field1","Text field2","Text field3","Numeric field1","Numeric field2"] | |
# defining path to the dataset folder | |
path=r'C:/Users/PARVATHY SARAT/Desktop/FIREWALL' | |
#accessing all CSV files (datasets or Firewall logs) in the folder | |
all_files=glob.glob(os.path.join(path,"*.csv")) | |
df_each=(pd.read_csv(f,header=None) for f in all_files) | |
#concatenating all the CSV files in the folder | |
result=pd.concat(df_each) | |
result.columns=names | |
arranged=result.iloc[:,[12,13]] | |
# ensuring 'arranged' is not a copy of the original 'result' dataset as | |
# it can lead to errors with certain functions and make it difficult | |
# to work with | |
arranged.is_copy=False | |
# Removing all the rows for which Output = "AUDIT" as | |
# they are redundant | |
arranged=arranged[arranged["Text field1"]!="AUDIT"] | |
arranged=arranged[arranged["Text field1"]!="AUDIT INVALID"] | |
features=arranged.columns[0] | |
arranged=arranged.dropna(axis=0) | |
train=np.array(arranged[features]) | |
train=train.reshape(-1,1) | |
#factorizing column to be predicted, saving it in 'label' | |
y,label = pd.factorize(arranged["Text field1"]) | |
from sklearn.ensemble import RandomForestClassifier | |
clf=RandomForestClassifier() | |
#fitting the Random Forest Classification model to the training data | |
clf.fit(train,y) | |
#import test file | |
test=pd.read_csv("file:///C:/Users/PARVATHY SARAT/Desktop/test.csv",names=names) | |
arranged_test=test.iloc[:,[12,13]] | |
# Applying same changes as to the training set | |
# Removing redundant rows | |
arranged_test=arranged_test[arranged_test["Text field1"]!="AUDIT"] | |
arranged_test=arranged_test[arranged_test["Text field1"]!="AUDIT INVALID"] | |
test=np.array(arranged_test[features]) | |
test=test.reshape(-1,1) | |
# Using the model to predict the Output (ie. fraud or not) on the test datset | |
pred=clf.predict(test) | |
pred_label=label[pred] | |
from sklearn.metrics import accuracy_score | |
#normalize=True yields fraction of correct answers | |
#normalize=False yields no. of correct answers | |
accuracy=accuracy_score(arranged_test["Text field1"],pred_label,normalize=True) | |
print (accuracy) | |
# 1.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment