Echooff3/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Scikit-Learn - Gaussian Naive Bayes (File or Binary)

I wanted to create a quick tool to determine whether a file is a text file or a binary file. I'm not 100% sure this was
the right approach. However, early testing shows that it's working.
My assumption is that you can determine a file's type from its first 128 bytes.
I'm going to throw more files at it and train it up some more.

  
## test.py
# https://scikit-learn.org/stable/modules/naive_bayes.html
# https://scikit-learn.org/stable/modules/model_persistence.html
import pickle
import sys
import numpy as np

MAX_BYTES = 128

classes = ["Text File", "Binary File"]


def read_features(path, max_bytes=MAX_BYTES):
    """Return the leading bytes of *path* as a ``(1, max_bytes)`` array.

    At most ``max_bytes`` bytes are read. Files shorter than that are
    right-padded with NUL bytes so the row always has ``max_bytes``
    columns; without the padding a short file yields a narrower row than
    a full-length one.

    NOTE(review): the pickled model is presumably fitted on rows of the
    same width and padding scheme -- confirm against the training script.
    """
    with open(path, 'rb') as f:
        head = f.read(max_bytes)
    return np.array([list(head.ljust(max_bytes, b'\0'))])


def main(argv):
    """Classify the file named in ``argv[1]`` and print the prediction.

    Returns the process exit status: 0 on success, 2 when no file name
    was supplied on the command line.
    """
    if len(argv) < 2:
        print(f'usage: {argv[0]} FILE', file=sys.stderr)
        return 2

    # SECURITY: unpickling can execute arbitrary code. Only load a
    # gnb.pickle that you produced yourself.
    with open('gnb.pickle', 'br') as f:
        gnb = pickle.load(f)

    pred = gnb.predict(read_features(argv[1]))
    print(f'Prediction ({pred}) {classes[pred.item()]}')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

## train.py
# https://scikit-learn.org/stable/modules/naive_bayes.html
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import glob
import os
import numpy as np
import pickle


MAX_BYTES = 128


def read_prefix(path, max_bytes=MAX_BYTES):
    """Return the first ``max_bytes`` bytes of *path* as a list of ints.

    Files shorter than ``max_bytes`` are right-padded with NUL bytes so
    every sample has the same length; ragged rows cannot be stacked into
    a 2-D feature matrix.
    """
    with open(path, 'rb') as f:
        head = f.read(max_bytes)
    return list(head.ljust(max_bytes, b'\0'))


def load_samples(pattern, label, max_bytes=MAX_BYTES):
    """Build ``(rows, labels)`` for every regular file matching *pattern*.

    Paths are sorted so the sample order -- and therefore the seeded
    train/test split -- does not depend on the order the OS lists files.
    Directories that happen to match the glob are skipped because they
    cannot be opened for reading.
    """
    paths = sorted(p for p in glob.glob(pattern) if os.path.isfile(p))
    rows = [read_prefix(p, max_bytes) for p in paths]
    return rows, [label] * len(paths)


def main():
    """Train a Gaussian Naive Bayes text/binary classifier and pickle it.

    Label 0 is used for files under ``sample_data/text/`` and label 1 for
    files under ``sample_data/binary/`` (both relative to this script).
    The fitted model is written to ``gnb.pickle`` in the current working
    directory.
    """
    dir_name = os.path.dirname(__file__)
    text_rows, text_labels = load_samples(
        os.path.join(dir_name, 'sample_data/text/') + '*.*', 0)
    binary_rows, binary_labels = load_samples(
        os.path.join(dir_name, 'sample_data/binary/') + '*.*', 1)

    if not (text_rows or binary_rows):
        # Fail with a clear message instead of an opaque splitter error.
        raise SystemExit('No sample files found under sample_data/')

    X = np.array(text_rows + binary_rows)
    Y = np.array(text_labels + binary_labels)

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    gnb = GaussianNB()

    y_pred = gnb.fit(X_train, y_train).predict(X_test)

    print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

    with open('gnb.pickle', 'bw') as f:
        pickle.dump(gnb, f, pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    main()
	# https://scikit-learn.org/stable/modules/naive_bayes.html
# https://scikit-learn.org/stable/modules/model_persistence.html
#
# Classify the file named on the command line as text or binary using
# the pickled model stored in gnb.pickle.
import pickle
import sys
import numpy as np

MAX_BYTES = 128

# SECURITY: unpickling can execute arbitrary code. Only load a gnb.pickle
# that you produced yourself.
with open('gnb.pickle', 'br') as f:
    gnb = pickle.load(f)

file_name = sys.argv[1]
with open(file_name, 'rb') as f:
    # Right-pad short files with NUL bytes so the feature row is always
    # MAX_BYTES wide, whatever the file's actual length.
    first_bytes = f.read(MAX_BYTES).ljust(MAX_BYTES, b'\0')
X = np.array([list(first_bytes)])

pred = gnb.predict(X)
classes = ["Text File", "Binary File"]

print(f'Prediction ({pred}) {classes[pred.item()]}')
	# https://scikit-learn.org/stable/modules/naive_bayes.html
#
# Train a Gaussian Naive Bayes classifier on the leading bytes of the
# sample files (label 0 = text, label 1 = binary) and pickle the model.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import glob
import os
import numpy as np
import pickle


dir_name = os.path.dirname(__file__)
# The wildcard is required: a bare '.' only matches the directory itself.
text_files = glob.glob(os.path.join(dir_name, 'sample_data/text/') + '*.*')
binary_files = glob.glob(os.path.join(dir_name, 'sample_data/binary/') + '*.*')
X = []
Y = []
MAX_BYTES = 128
for file_name in text_files:
    with open(file_name, 'rb') as f:
        # Right-pad short files with NUL bytes so every row has the same
        # width; ragged rows cannot form a 2-D feature matrix.
        first_bytes = f.read(MAX_BYTES).ljust(MAX_BYTES, b'\0')
    X.append(list(first_bytes))
    Y.append(0)

for file_name in binary_files:
    with open(file_name, 'rb') as f:
        first_bytes = f.read(MAX_BYTES).ljust(MAX_BYTES, b'\0')
    X.append(list(first_bytes))
    Y.append(1)

X = np.array(X)
Y = np.array(Y)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)

print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (y_test != y_pred).sum()))

with open('gnb.pickle', 'bw') as f:
    pickle.dump(gnb, f, pickle.HIGHEST_PROTOCOL)