Last active
July 2, 2020 12:57
-
-
Save dtrizna/ee6bbd5f0089efc25b837e31115c2396 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.preprocessing import LabelEncoder | |
from tensorflow.keras.preprocessing.sequence import pad_sequences | |
from datetime import datetime | |
import numpy as np | |
# Fixed sequence length: each process's event sequence is padded or truncated
# to exactly this many timesteps before being stacked into the 3D arrays.
MAX_TIMESTEPS = 128
# number of features (except PID itself)
# NOTE(review): `newdf` is not defined in this part of the file; it is assumed
# to be the prepared dataframe whose columns are the PID plus the feature
# columns used by groupby_transform -- confirm against the caller.
N = len(newdf.columns) - 1
def _stack_samples(pairs):
    """Concatenate collected (sample, label) pairs into one X array and one y array.

    The concatenation is seeded with empty int16 arrays of the expected shape,
    so an empty ``pairs`` list still yields correctly shaped (0, ...) arrays.
    """
    seed_X = np.empty(shape=(0, MAX_TIMESTEPS, N)).astype(np.int16)
    seed_y = np.empty(shape=(0, 1)).astype(np.int16)
    stacked_X = np.concatenate([seed_X] + [sample for sample, _ in pairs])
    stacked_y = np.concatenate([seed_y] + [label for _, label in pairs])
    return stacked_X, stacked_y


def groupby_transform(dataframe, column):
    """Turn per-group event rows into padded sequences split into train/validation.

    The dataframe is grouped by ``column``. Each group with at least 4 rows
    becomes one sample of shape ``(MAX_TIMESTEPS, N)`` built from the columns
    'EventID', 'unc_url', 'b64', 'network' plus label-encoded 'binary' and
    'path'. Labels come from two files: group keys listed in
    ``data/pid_valid.lst`` get 0, keys listed in ``data/pid_malicious.lst``
    get 1. Groups whose key appears (before the first ':') in
    ``data/validation_pid.lst`` go to the validation set, all others to the
    training set.

    Args:
        dataframe: pandas DataFrame containing ``column`` and the six feature
            columns named above.
        column: name of the column to group by.
            NOTE(review): group keys are compared against stripped text lines
            from the list files, so they are presumably strings -- confirm.

    Returns:
        Tuple ``(train_X, val_X, train_y, val_y)``. The X arrays have shape
        ``(samples, MAX_TIMESTEPS, N)``, the y arrays ``(samples, 1)``.
        If the loop is interrupted with Ctrl-C, the samples collected so far
        are returned instead of propagating ``KeyboardInterrupt``.

    Raises:
        ValueError: if a group key with at least 4 rows is in neither the
            valid nor the malicious list.
    """
    grouped = dataframe.groupby(column)
    total = len(grouped)
    start = datetime.now()
    print(f"Started at: {start}")
    print(f"Total categories of '{column}': '{total}'")
    print(f"Unique values: {dataframe[column].nunique()}")

    # get encode object for string columns
    binary_le = LabelEncoder().fit(['OTHER'] + list(dataframe['binary'].unique()))
    path_le = LabelEncoder().fit(['OTHER'] + list(dataframe['path'].unique()))

    # load malicious/valid process lists; sets give O(1) membership tests
    with open(r'data/pid_valid.lst') as f:
        valid_pids = {line.strip() for line in f}
    with open(r'data/pid_malicious.lst') as f:
        malicious_pids = {line.strip() for line in f}
    with open(r'data/validation_pid.lst') as f:
        # only lines containing ':' carry a PID (the text before the first ':')
        validation_pids = {line.split(':')[0] for line in f if ':' in line}

    # Samples are collected in lists and concatenated once at the end;
    # concatenating inside the loop would recopy the whole dataset per group.
    train_pairs = []
    val_pairs = []

    # Pre-bind the loop index so the interrupt message works even when the
    # interrupt arrives before the first group has been processed.
    i = None
    interrupted = False
    try:
        for i, (value, group_df) in enumerate(grouped):
            # skip processes with fewer than 4 events
            # - too little to identify malicious activity
            if len(group_df) < 4:
                continue

            if value in valid_pids:
                temp_y = np.array([0]).reshape(1, 1)
            elif value in malicious_pids:
                temp_y = np.array([1]).reshape(1, 1)
            else:
                message = f'Unclassified ProcessID: {value}'
                print(message)
                raise ValueError(message)

            # Build a 2D (events, features) array: numeric columns first,
            # then the label-encoded string columns.
            temp_X = np.hstack((
                group_df[['EventID', 'unc_url', 'b64', 'network']].to_numpy(),
                binary_le.transform(list(group_df['binary'])).reshape(-1, 1),
                path_le.transform(list(group_df['path'])).reshape(-1, 1),
            ))

            # PADDING: transpose so each feature column is one sequence, let
            # pad_sequences bring it to MAX_TIMESTEPS, then transpose back to
            # (MAX_TIMESTEPS, features).
            temp_X = pad_sequences(temp_X.T, maxlen=MAX_TIMESTEPS).T
            sample = temp_X.reshape(1, MAX_TIMESTEPS, N)

            # adding this example to actual set
            if value in validation_pids:
                val_pairs.append((sample, temp_y))
            else:
                train_pairs.append((sample, temp_y))
    except KeyboardInterrupt:
        # Deliberate best-effort: return whatever was collected so far.
        interrupted = True

    end = datetime.now()
    if interrupted:
        print(f"Ended at:\niteration:{i}\ntime:{end}")
    else:
        print(f"Ended at: {end}")
    print(f"Script completion time: {end - start}")

    train_X, train_y = _stack_samples(train_pairs)
    val_X, val_y = _stack_samples(val_pairs)
    return train_X, val_X, train_y, val_y
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment