Created
February 20, 2018 09:27
-
-
Save anonymous/328e53cba8c1024e579aca18d22ab0b5 to your computer and use it in GitHub Desktop.
Honeypot classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Honeypot classifier. https://www.kaggle.com/mrklees/applying-keras-scikit-learn-to-titanic""" | |
import pandas as pd | |
import numpy as np | |
import seaborn as sns | |
from keras.utils import to_categorical | |
from keras.models import Sequential | |
from keras.layers import Dense, Dropout | |
from sklearn.model_selection import train_test_split | |
from random import randint | |
from sklearn.neighbors import KNeighborsRegressor | |
from sklearn.neural_network import MLPClassifier | |
from sklearn.ensemble import VotingClassifier, BaggingClassifier | |
from sklearn.model_selection import GridSearchCV | |
from sklearn.preprocessing import LabelBinarizer, StandardScaler | |
# Relative path of the training CSV.
# NOTE(review): nothing in the visible code reads this constant --
# HoneypotData builds its path from ``filepath`` + ``train_fn`` instead.
# Confirm whether it is still needed.
FILENAME = '../data/honeypot_dataset.csv'
def encode_one_hot(df, column, axis=1):
    """Replace ``column`` of ``df`` with one-hot indicator columns.

    :param df: DataFrame that holds the column to encode.
    :param column: label of the column to encode; it is also used as the
        prefix of the generated indicator columns.
    :param axis: axis handed to ``DataFrame.drop`` when the original column
        is removed (the default, 1, drops it as a column).
    :return: a new DataFrame made of ``df`` joined with the indicator
        columns, without the original ``column``.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    widened = df.join(indicators)
    return widened.drop(column, axis=axis)
class HoneypotData(object):
    """Data pipeline for the honeypot toll-fraud classifier.

    Covers everything from the raw CSV files to prepared numpy arrays:
    feature selection, NA filling and one-hot encoding of the categorical
    columns.  The model class inherits from it, but it is kept separate for
    readability and logical organization.

    After construction the instance exposes ``X_train``, ``y_train``,
    ``X_valid`` and ``y_valid``.
    """

    filepath = '../data/'
    train_fn = 'honeypot_dataset.csv'
    test_fn = 'honeypot_dataset_test.csv'

    # Categorical columns that get one-hot encoded.  The same list is used
    # for train, validation and test data so that every matrix has the same
    # columns ('from_user' is dropped by select_features and therefore must
    # not be encoded for the test set either).
    _CAT_VARS = ('ruri_user',
                 'from_domain',
                 'to_user',
                 'contact_user',
                 'user_agent',
                 'source_ip',
                 'contact_ip')
    # numpy dtype every categorical column is cast to before encoding.
    _CAT_TYPE = 'str'

    def __init__(self):
        """Run the whole pipeline and keep the resulting arrays."""
        self.X_train, self.y_train, self.X_valid, self.y_valid = self.preproc()

    @staticmethod
    def _print_shapes(label, *tables):
        """Print ``label`` followed by one "rows cols" line per table."""
        # Single-argument print() behaves the same on Python 2 and 3.
        print(label)
        for table in tables:
            print('%d %d' % (table.shape[0], table.shape[1]))

    def import_and_split_data(self):
        """Read the training CSV and split it into train/validation sets.

        The split is stratified on the target: the proportion of target
        values in each produced sample matches the proportion in ``y``.
        For example, if ``y`` is binary with 25% zeros and 75% ones, both
        splits keep 25% zeros and 75% ones.

        :return: ``(X_train, y_train, X_valid, y_valid)`` as pandas objects;
            25% of the rows go to the validation set.
        """
        X, y = self.select_features(pd.read_csv(self.filepath + self.train_fn))
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.25, random_state=606, stratify=y)
        return X_train, y_train, X_valid, y_valid

    def select_features(self, data):
        """Split ``data`` into the model features and the target.

        :param data: DataFrame read from the dataset CSV.
        :return: ``(X, y)`` where ``X`` holds the kept feature columns and
            ``y`` is a one-column DataFrame with ``toll_fraud``.
        """
        features = ['ruri',
                    'ruri_user',
                    'ruri_domain',
                    'from_user',
                    'from_domain',
                    'from_tag',
                    'to_user',
                    'contact_user',
                    'callid',
                    'content_type',
                    'user_agent',
                    'source_ip',
                    'source_port',
                    'destination_port',
                    'contact_ip',
                    'contact_port']
        dropped_features = ['ruri', 'ruri_domain', 'callid', 'from_tag',
                            'content_type', 'from_user']
        target = ['toll_fraud']
        # Select the kept columns directly instead of selecting everything
        # and dropping afterwards; the result is the same, and the unused
        # columns no longer have to be present in the input.
        kept = [name for name in features if name not in dropped_features]
        X = data[kept]
        y = data[target]
        return X, y

    def fix_na(self, data):
        """Fill missing values.

        ``contact_user`` gets ``"test"`` and ``content_type`` gets
        ``"application/sdp"``; other columns are left untouched.

        :param data: DataFrame to clean.
        :return: a new DataFrame with the NAs above filled.
        """
        na_vars = {"contact_user": "test", "content_type": "application/sdp"}
        return data.fillna(na_vars)

    def encode_one_hot(self, df, column, axis=1):
        """Replace ``column`` of ``df`` with one-hot indicator columns.

        :param df: DataFrame holding the column to encode.
        :param column: column label, also used as the indicator prefix.
        :param axis: axis passed to ``DataFrame.drop`` for the original
            column.
        :return: ``df`` joined with the indicators, minus ``column``.
        """
        return df.join(pd.get_dummies(df[column], column)).drop(column, axis=axis)

    def create_dummies(self, data, cat_vars, cat_types, fit=True):
        """Encode categorical columns as dummy (indicator) variables.

        Each column is encoded with its own ``LabelBinarizer`` and the
        results are stacked side by side in ``cat_vars`` order.  Only the
        encoded columns are returned; every other column of ``data`` is
        left out.

        The fitted binarizers are remembered on the instance so that later
        data sets can be encoded with exactly the same columns.  Fitting a
        fresh binarizer per data set (the previous behavior for every call)
        yields matrices whose width and column order depend on the values
        that happen to be present, so train/validation/test would not line
        up.

        :param data: DataFrame containing ``cat_vars``.
        :param cat_vars: labels of the columns to encode.
        :param cat_types: numpy dtype each column is cast to before
            encoding, aligned with ``cat_vars``.
        :param fit: when true (default) fit new binarizers on ``data``;
            when false reuse the ones fitted by an earlier call.
        :return: 2-D integer array, one row per row of ``data``.
        :raises ValueError: if ``fit`` is false and a column has no fitted
            binarizer yet.
        """
        encoders = getattr(self, '_binarizers', None)
        if encoders is None:
            encoders = self._binarizers = {}

        encoded = []
        for column, dtype in zip(cat_vars, cat_types):
            values = data[column].values.astype(dtype)
            if fit:
                encoders[column] = LabelBinarizer().fit(values)
            elif column not in encoders:
                raise ValueError(
                    "no fitted encoder for column %r; call create_dummies "
                    "with fit=True on the training data first" % (column,))
            # NOTE(review): per the scikit-learn docs, labels unseen during
            # fit should come out as all-zero rows -- confirm for the
            # installed version.
            encoded.append(encoders[column].transform(values))

        if not encoded:
            # Nothing to encode: keep the row count, zero columns.
            return np.empty((len(data), 0))
        return np.column_stack(encoded)

    def preproc(self):
        """Execute the full preprocessing pipeline on the training CSV.

        :return: ``(X_train, y_train, X_valid, y_valid)`` numpy arrays; the
            feature matrices are float32.
        """
        # Import data & split.
        X_train_raw, y_train, X_valid_raw, y_valid = self.import_and_split_data()

        # Fill NAs.
        X_train, X_valid = self.fix_na(X_train_raw), self.fix_na(X_valid_raw)

        # Preprocess categorical vars.
        cat_vars = list(self._CAT_VARS)
        cat_types = [self._CAT_TYPE] * len(cat_vars)

        self._print_shapes('Before create_dummies', X_train, X_valid)

        # Fit the encoders on the training split only and reuse them for
        # validation so both matrices share the same columns.
        X_train_cat = self.create_dummies(X_train, cat_vars, cat_types, fit=True)
        X_valid_cat = self.create_dummies(X_valid, cat_vars, cat_types, fit=False)

        self._print_shapes('After create_dummies', X_train_cat, X_valid_cat)

        # NOTE(review): only the encoded categorical columns are kept; the
        # port columns selected by select_features are not part of the
        # result.  Confirm that this is intended.
        X_train, X_valid = X_train_cat, X_valid_cat

        self._print_shapes('After assignment', X_train, X_valid)

        return (X_train.astype('float32'), y_train.values,
                X_valid.astype('float32'), y_valid.values)

    def preproc_test(self):
        """Load and encode the test CSV with the training encoders.

        Must be called after ``preproc`` (which ``__init__`` runs) so that
        the fitted encoders exist.

        :return: ``(labels, test)`` where ``labels`` is the ``toll_fraud``
            column as an array and ``test`` is the float32 feature matrix
            with the same columns as the training matrix.
        """
        test = pd.read_csv(self.filepath + self.test_fn)
        labels = test.toll_fraud.values
        test = self.fix_na(test)

        # Preprocess categorical vars with the same column list as training.
        cat_vars = list(self._CAT_VARS)
        cat_types = [self._CAT_TYPE] * len(cat_vars)
        test_cat = self.create_dummies(test, cat_vars, cat_types, fit=False)

        return labels, test_cat.astype('float32')
class HoneypotKeras(HoneypotData):
    """Keras feed-forward classifier trained on the HoneypotData arrays.

    Typical use: construct, call ``build_model()``, call ``fit()`` one or
    more times, then ``prepare_submission()``.
    """

    def __init__(self):
        """Run the data pipeline and one-hot encode the targets."""
        # The parent constructor runs preproc() and stores the four arrays.
        super(HoneypotKeras, self).__init__()
        self.y_train = to_categorical(self.y_train)
        self.y_valid = to_categorical(self.y_valid)
        self._print_dataset_shapes('To categorical')
        # Width of the feature matrix; used as the network's input shape.
        self.feature_count = self.X_train.shape[1]
        # One Keras History object per fit() call.
        self.history = []
        # Set by build_model().
        self.model = None

    def _print_dataset_shapes(self, label):
        """Print ``label`` and a "rows cols" line for each data array."""
        # Single-argument print() behaves the same on Python 2 and 3.
        print(label)
        for table in (self.X_train, self.y_train, self.X_valid, self.y_valid):
            print('%d %d' % (table.shape[0], table.shape[1]))

    def _require_model(self):
        """Fail with a clear message when build_model() was not called."""
        if getattr(self, 'model', None) is None:
            raise RuntimeError('call build_model() before using the model')

    def build_model(self):
        """Create and compile the network and store it in ``self.model``.

        Four ReLU dense layers (2056, 1028, 1028, 512 units) with increasing
        dropout, followed by a 2-unit sigmoid output.
        """
        model = Sequential()
        model.add(Dense(2056, input_shape=(self.feature_count,), activation='relu'))
        model.add(Dropout(0.1))
        model.add(Dense(1028, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1028, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.4))
        # NOTE(review): the targets are one-hot encoded, so a softmax output
        # with categorical_crossentropy would be the usual pairing; the
        # original sigmoid/binary_crossentropy setup is kept unchanged.
        model.add(Dense(2, activation='sigmoid'))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        self.model = model

    def fit(self, lr=0.001, epochs=1):
        """Train the model and record the training history.

        :param lr: learning rate to use for this round of training.
        :param epochs: number of epochs to train.
        :raises RuntimeError: if build_model() has not been called.
        """
        self._require_model()
        # Update the optimizer's learning-rate variable in place.  Rebinding
        # ``optimizer.lr`` to a Python float is not picked up once the
        # training function has been built, so a second fit() call with a
        # different ``lr`` would keep training with the first value.
        from keras import backend as K
        K.set_value(self.model.optimizer.lr, lr)

        self._print_dataset_shapes('Training data:')

        hist = self.model.fit(self.X_train, self.y_train,
                              batch_size=32, epochs=epochs,
                              verbose=1,
                              validation_data=(self.X_valid, self.y_valid))
        self.history.append(hist)

    def prepare_submission(self, name):
        """Predict on the test set and print the predictions.

        :param name: currently unused; kept for interface compatibility.
        :return: the prediction array produced by the model.
        :raises RuntimeError: if build_model() has not been called.
        """
        self._require_model()
        labels, test_data = self.preproc_test()
        # Make sure Keras receives a numeric array even if the encoder
        # produced an object-dtype matrix.
        test_data = np.asarray(test_data, dtype='float32')
        predictions = self.model.predict(test_data)
        print(predictions)
        return predictions
# Script entry point: constructing HoneypotKeras runs the preprocessing
# pipeline (its __init__ calls preproc()).
# NOTE(review): this executes whenever the module is imported; consider an
# ``if __name__ == '__main__':`` guard if the module is ever imported.
model = HoneypotKeras()
# The remaining steps (build, two training rounds, prediction) are
# currently disabled:
# model.build_model()
# model.fit(lr=0.01, epochs=5)
# model.fit(lr=0.001, epochs=10)
# model.prepare_submission('keras')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment