Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@parvathysarat
Last active March 28, 2020 19:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save parvathysarat/fc537f26d3d70639fc80a9a43b7c0263 to your computer and use it in GitHub Desktop.
Save parvathysarat/fc537f26d3d70639fc80a9a43b7c0263 to your computer and use it in GitHub Desktop.
This Random Forest model was developed to detect fraud in login access to a firewall server. It is a two-class (binary) classification task.
import glob,struct,os
import pandas as pd
import numpy as np
# Column names assigned to the header-less firewall log CSVs.
names=["Timestamp","Customer ID","Host","Log file","Log sequence no.","Entry type","Entry identifier","User,if","Reporting IP/host","Source IP,if","Source port,if","Destination IP, if","Destination Port, if","Text field1","Text field2","Text field3","Numeric field1","Numeric field2"]
# Folder holding the training logs (one or more CSV files).
path=r'C:/Users/PARVATHY SARAT/Desktop/FIREWALL'


def _prepare(frame):
    """Return the cleaned feature/label columns of a firewall log frame.

    Keeps only positional columns 12 and 13 ("Destination Port, if" and
    "Text field1" under ``names``), removes the rows whose "Text field1"
    is "AUDIT" or "AUDIT INVALID", and drops rows with missing values.

    The same routine is used for the training and the test data so that
    both go through identical preprocessing.

    Parameters
    ----------
    frame : pandas.DataFrame
        Log data whose columns are labelled with ``names``.

    Returns
    -------
    pandas.DataFrame
        An independent two-column frame (feature first, label second).
    """
    # An explicit copy replaces the deprecated `is_copy = False` switch:
    # the result is guaranteed not to be a view of `frame`.
    subset = frame.iloc[:, [12, 13]].copy()
    # "AUDIT" / "AUDIT INVALID" rows are treated as redundant and removed.
    subset = subset[~subset["Text field1"].isin(["AUDIT", "AUDIT INVALID"])]
    return subset.dropna(axis=0)


# Collect every CSV file (firewall log) in the dataset folder.
all_files=glob.glob(os.path.join(path,"*.csv"))
if not all_files:
    # pd.concat would raise a ValueError with an opaque message here;
    # raise the same exception type but say what is actually missing.
    raise ValueError("No CSV files found in %r" % path)
df_each=(pd.read_csv(f,header=None) for f in all_files)
# Concatenate all the log files into a single frame.
result=pd.concat(df_each)
result.columns=names

arranged=_prepare(result)
# The single feature is the first kept column; the second one is the label.
features=arranged.columns[0]
# scikit-learn expects a 2-D feature matrix, hence the (-1, 1) reshape.
train=np.array(arranged[features]).reshape(-1,1)
# Encode the label column as integers; `label` maps the codes back to text.
y,label = pd.factorize(arranged["Text field1"])

from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier()
# Fit the Random Forest classifier to the training data.
clf.fit(train,y)

# Load the test file and apply exactly the same preprocessing as for the
# training set (including the removal of rows with missing values).
test=pd.read_csv("file:///C:/Users/PARVATHY SARAT/Desktop/test.csv",names=names)
arranged_test=_prepare(test)
test=np.array(arranged_test[features]).reshape(-1,1)

# Predict the label codes for the test set and map them back to text.
pred=clf.predict(test)
pred_label=label[pred]

from sklearn.metrics import accuracy_score
# normalize=True yields the fraction of correct answers,
# normalize=False yields the number of correct answers.
accuracy=accuracy_score(arranged_test["Text field1"],pred_label,normalize=True)
print (accuracy)
# 1.0  (value recorded in the original script)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment